[GENERAL] Removed deprecated driver files and added basic compatibility with rocm (#268)
- Removed driver module -- accelerator runtime is handled by PyTorch - Added basic support for ROCm based on @micmelesse's PR -- an empty kernel can now be executed on AMD devices without any compile-time changes - PREFER_SHARED is now only used for kernels whose shared memory size is greater than 49k; otherwise there can be poor L1 performance for broadcast tensors
This commit is contained in:
		@@ -31,7 +31,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS  -std=gnu++17")
 | 
				
			|||||||
# LLVM
 | 
					# LLVM
 | 
				
			||||||
##########
 | 
					##########
 | 
				
			||||||
if("${LLVM_LIBRARY_DIR}" STREQUAL "")
 | 
					if("${LLVM_LIBRARY_DIR}" STREQUAL "")
 | 
				
			||||||
    find_package(LLVM 11 REQUIRED COMPONENTS "nvptx")
 | 
					    find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu")
 | 
				
			||||||
    message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
 | 
					    message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
 | 
				
			||||||
    if(APPLE)
 | 
					    if(APPLE)
 | 
				
			||||||
      set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14")
 | 
					      set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14")
 | 
				
			||||||
@@ -39,14 +39,52 @@ if("${LLVM_LIBRARY_DIR}" STREQUAL "")
 | 
				
			|||||||
# sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros
 | 
					# sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros
 | 
				
			||||||
else()
 | 
					else()
 | 
				
			||||||
    set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}")
 | 
					    set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}")
 | 
				
			||||||
    set(LLVM_LIBRARIES libLLVMNVPTXCodeGen.a libLLVMSelectionDAG.a libLLVMipo.a libLLVMInstrumentation.a
 | 
					    set(LLVM_LIBRARIES 
 | 
				
			||||||
                       libLLVMVectorize.a libLLVMLinker.a libLLVMIRReader.a libLLVMAsmParser.a libLLVMFrontendOpenMP.a
 | 
					libLLVMNVPTXCodeGen.a
 | 
				
			||||||
                       libLLVMAsmPrinter.a libLLVMDebugInfoDWARF.a libLLVMCodeGen.a libLLVMTarget.a libLLVMScalarOpts.a
 | 
					libLLVMNVPTXDesc.a
 | 
				
			||||||
                       libLLVMInstCombine.a libLLVMAggressiveInstCombine.a libLLVMTransformUtils.a libLLVMBitWriter.a
 | 
					libLLVMNVPTXInfo.a
 | 
				
			||||||
                       libLLVMAnalysis.a libLLVMProfileData.a libLLVMObject.a libLLVMTextAPI.a libLLVMMCParser.a
 | 
					libLLVMAMDGPUDisassembler.a
 | 
				
			||||||
                       libLLVMBitReader.a libLLVMCore.a libLLVMRemarks.a libLLVMBitstreamReader.a libLLVMNVPTXDesc.a
 | 
					libLLVMMCDisassembler.a
 | 
				
			||||||
                       libLLVMMC.a libLLVMDebugInfoCodeView.a libLLVMDebugInfoMSF.a libLLVMBinaryFormat.a libLLVMNVPTXInfo.a
 | 
					libLLVMAMDGPUCodeGen.a
 | 
				
			||||||
                       libLLVMSupport.a libLLVMDemangle.a)
 | 
					libLLVMMIRParser.a
 | 
				
			||||||
 | 
					libLLVMGlobalISel.a
 | 
				
			||||||
 | 
					libLLVMSelectionDAG.a
 | 
				
			||||||
 | 
					libLLVMipo.a
 | 
				
			||||||
 | 
					libLLVMInstrumentation.a
 | 
				
			||||||
 | 
					libLLVMVectorize.a
 | 
				
			||||||
 | 
					libLLVMLinker.a
 | 
				
			||||||
 | 
					libLLVMIRReader.a
 | 
				
			||||||
 | 
					libLLVMAsmParser.a
 | 
				
			||||||
 | 
					libLLVMFrontendOpenMP.a
 | 
				
			||||||
 | 
					libLLVMAsmPrinter.a
 | 
				
			||||||
 | 
					libLLVMDebugInfoDWARF.a
 | 
				
			||||||
 | 
					libLLVMCodeGen.a
 | 
				
			||||||
 | 
					libLLVMTarget.a
 | 
				
			||||||
 | 
					libLLVMScalarOpts.a
 | 
				
			||||||
 | 
					libLLVMInstCombine.a
 | 
				
			||||||
 | 
					libLLVMAggressiveInstCombine.a
 | 
				
			||||||
 | 
					libLLVMTransformUtils.a
 | 
				
			||||||
 | 
					libLLVMBitWriter.a
 | 
				
			||||||
 | 
					libLLVMAnalysis.a
 | 
				
			||||||
 | 
					libLLVMProfileData.a
 | 
				
			||||||
 | 
					libLLVMObject.a
 | 
				
			||||||
 | 
					libLLVMTextAPI.a
 | 
				
			||||||
 | 
					libLLVMBitReader.a
 | 
				
			||||||
 | 
					libLLVMAMDGPUAsmParser.a
 | 
				
			||||||
 | 
					libLLVMMCParser.a
 | 
				
			||||||
 | 
					libLLVMAMDGPUDesc.a
 | 
				
			||||||
 | 
					libLLVMAMDGPUUtils.a
 | 
				
			||||||
 | 
					libLLVMMC.a
 | 
				
			||||||
 | 
					libLLVMDebugInfoCodeView.a
 | 
				
			||||||
 | 
					libLLVMDebugInfoMSF.a
 | 
				
			||||||
 | 
					libLLVMCore.a
 | 
				
			||||||
 | 
					libLLVMRemarks.a
 | 
				
			||||||
 | 
					libLLVMBitstreamReader.a
 | 
				
			||||||
 | 
					libLLVMBinaryFormat.a
 | 
				
			||||||
 | 
					libLLVMAMDGPUInfo.a
 | 
				
			||||||
 | 
					libLLVMSupport.a
 | 
				
			||||||
 | 
					libLLVMDemangle.a
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
endif()
 | 
					endif()
 | 
				
			||||||
include_directories("${LLVM_INCLUDE_DIRS}")
 | 
					include_directories("${LLVM_INCLUDE_DIRS}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -4,8 +4,17 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <memory>
 | 
					#include <memory>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace llvm{
 | 
				
			||||||
 | 
					  class Module;
 | 
				
			||||||
 | 
					  class LLVMContext;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace triton{
 | 
					namespace triton{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace codegen {
 | 
				
			||||||
 | 
					  class target;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace ir{
 | 
					namespace ir{
 | 
				
			||||||
  class module;
 | 
					  class module;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -21,8 +30,10 @@ namespace codegen{
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
// TODO:
 | 
					// TODO:
 | 
				
			||||||
// There should be a proper pass manager there!
 | 
					// There should be a proper pass manager there!
 | 
				
			||||||
void add_passes_to_emit_bin(ir::module &ir, driver::device* dev, int num_warps, int num_stages, bool force_nc_cache,
 | 
					std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx,
 | 
				
			||||||
                            driver::module*& mod, driver::kernel*& ker, size_t& shared_mem);
 | 
					                                                     codegen::target* target,
 | 
				
			||||||
 | 
					                                                     int sm, int num_warps,
 | 
				
			||||||
 | 
					                                                     int num_stages, bool force_nc_cache, int &shared_static);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,137 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_BACKEND_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_BACKEND_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <map>
 | 
					 | 
				
			||||||
#include <list>
 | 
					 | 
				
			||||||
#include <vector>
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace llvm
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
class Module;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class buffer;
 | 
					 | 
				
			||||||
class stream;
 | 
					 | 
				
			||||||
class device;
 | 
					 | 
				
			||||||
class context;
 | 
					 | 
				
			||||||
class platform;
 | 
					 | 
				
			||||||
class module;
 | 
					 | 
				
			||||||
class kernel;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct backend
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // platforms
 | 
					 | 
				
			||||||
  class platforms
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    friend class backend;
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static void init();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static void get(std::vector<driver::platform*> &results);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static std::vector<driver::platform*> cache_;
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // devices
 | 
					 | 
				
			||||||
  class devices
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    friend class backend;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static void init(const std::vector<platform *> &platforms);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static void get(std::vector<driver::device*>& devs);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static std::vector<driver::device*> cache_;
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // modules
 | 
					 | 
				
			||||||
  class modules
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    friend class backend;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static void release();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // kernels
 | 
					 | 
				
			||||||
  class kernels
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    friend class backend;
 | 
					 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static void release();
 | 
					 | 
				
			||||||
    static driver::kernel* get(driver::module* mod, const std::string & name);
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static std::map<std::tuple<module*, std::string>, driver::kernel*> cache_;
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // contexts
 | 
					 | 
				
			||||||
  class contexts
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    friend class backend;
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static void init(const std::vector<device *> &);
 | 
					 | 
				
			||||||
    static void release();
 | 
					 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static driver::context* get_default();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    static driver::context* import(CUcontext ctx)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      for(driver::context* x: cache_){
 | 
					 | 
				
			||||||
        driver::cu_context* cu_x = (driver::cu_context*)x;
 | 
					 | 
				
			||||||
        if(*cu_x->cu()==ctx)
 | 
					 | 
				
			||||||
          return x;
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      cache_.emplace_back(new driver::cu_context(ctx, false));
 | 
					 | 
				
			||||||
      return cache_.back();
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    static void get(std::list<driver::context*> &);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static std::list<driver::context*> cache_;
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // streams
 | 
					 | 
				
			||||||
  class streams
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    friend class backend;
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static void init(std::list<context*> const &);
 | 
					 | 
				
			||||||
    static void release();
 | 
					 | 
				
			||||||
  public:
 | 
					 | 
				
			||||||
    static void get(driver::context*, std::vector<driver::stream *> &streams);
 | 
					 | 
				
			||||||
    static driver::stream* get(driver::context*, unsigned int id = 0);
 | 
					 | 
				
			||||||
    static driver::stream* get_default();
 | 
					 | 
				
			||||||
  private:
 | 
					 | 
				
			||||||
    static std::map<driver::context*, std::vector<driver::stream*> > cache_;
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  static void init();
 | 
					 | 
				
			||||||
  static void release();
 | 
					 | 
				
			||||||
  static void synchronize(triton::driver::context *);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  static unsigned int default_device;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -1,48 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_BUFFER_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_BUFFER_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class stream;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Base
 | 
					 | 
				
			||||||
class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  buffer(size_t size, CUdeviceptr cl, bool take_ownership);
 | 
					 | 
				
			||||||
  buffer(size_t size, host_buffer_t hst, bool take_ownership);
 | 
					 | 
				
			||||||
  uintptr_t addr_as_uintptr_t();
 | 
					 | 
				
			||||||
  static buffer* create(driver::context* ctx, size_t size);
 | 
					 | 
				
			||||||
  size_t size();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  size_t size_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CPU
 | 
					 | 
				
			||||||
class host_buffer: public buffer
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  host_buffer(size_t size);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA
 | 
					 | 
				
			||||||
class cu_buffer: public buffer
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  cu_buffer(size_t size);
 | 
					 | 
				
			||||||
  cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership);
 | 
					 | 
				
			||||||
  void set_zero(triton::driver::stream *queue, size_t size);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -1,50 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_CONTEXT_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_CONTEXT_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/device.h"
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class context: public polymorphic_resource<CUcontext, host_context_t>{
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  static std::string get_cache_path();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  context(driver::device *dev, CUcontext cu, bool take_ownership);
 | 
					 | 
				
			||||||
  context(driver::device *dev, host_context_t hst, bool take_ownership);
 | 
					 | 
				
			||||||
  driver::device* device() const;
 | 
					 | 
				
			||||||
  std::string const & cache_path() const;
 | 
					 | 
				
			||||||
  // factory methods
 | 
					 | 
				
			||||||
  static context* create(driver::device *dev);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  driver::device* dev_;
 | 
					 | 
				
			||||||
  std::string cache_path_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host
 | 
					 | 
				
			||||||
class host_context: public context {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  host_context(driver::device* dev);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA
 | 
					 | 
				
			||||||
class cu_context: public context {
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  static CUdevice get_device_of(CUcontext);
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  //Constructors
 | 
					 | 
				
			||||||
  cu_context(CUcontext cu, bool take_ownership = true);
 | 
					 | 
				
			||||||
  cu_context(driver::device* dev);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -1,82 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_DEVICE_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_DEVICE_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/platform.h"
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace codegen
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
class target;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class context;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Base device
 | 
					 | 
				
			||||||
class device: public polymorphic_resource<CUdevice, host_device_t>{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  using polymorphic_resource::polymorphic_resource;
 | 
					 | 
				
			||||||
  virtual size_t max_threads_per_block() const = 0;
 | 
					 | 
				
			||||||
  virtual size_t max_shared_memory() const = 0;
 | 
					 | 
				
			||||||
  virtual std::unique_ptr<codegen::target> make_target() const = 0;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host device
 | 
					 | 
				
			||||||
class host_device: public device {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  host_device(): device(host_device_t(), true){ }
 | 
					 | 
				
			||||||
  size_t max_threads_per_block() const { return 1; }
 | 
					 | 
				
			||||||
  size_t max_shared_memory() const { return 0; }
 | 
					 | 
				
			||||||
  std::unique_ptr<codegen::target> make_target() const;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA device
 | 
					 | 
				
			||||||
class cu_device: public device {
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  //Metaprogramming helper to get cuda info from attribute
 | 
					 | 
				
			||||||
  template<CUdevice_attribute attr>
 | 
					 | 
				
			||||||
  int cuGetInfo() const;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  inline nvmlDevice_t nvml_device() const;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){}
 | 
					 | 
				
			||||||
  // Informations
 | 
					 | 
				
			||||||
  std::string infos() const;
 | 
					 | 
				
			||||||
  size_t address_bits() const;
 | 
					 | 
				
			||||||
  std::vector<size_t> max_block_dim() const;
 | 
					 | 
				
			||||||
  size_t warp_size() const;
 | 
					 | 
				
			||||||
  // Compute Capability
 | 
					 | 
				
			||||||
  void interpret_as(int cc);
 | 
					 | 
				
			||||||
  int compute_capability() const;
 | 
					 | 
				
			||||||
  // Identifier
 | 
					 | 
				
			||||||
  std::string name() const;
 | 
					 | 
				
			||||||
  std::string pci_bus_id() const;
 | 
					 | 
				
			||||||
  // Clocks
 | 
					 | 
				
			||||||
  size_t current_sm_clock() const;
 | 
					 | 
				
			||||||
  size_t current_mem_clock() const;
 | 
					 | 
				
			||||||
  size_t max_threads_per_block() const;
 | 
					 | 
				
			||||||
  size_t max_shared_memory() const;
 | 
					 | 
				
			||||||
  size_t max_sm_clock() const;
 | 
					 | 
				
			||||||
  size_t max_mem_clock() const;
 | 
					 | 
				
			||||||
  void set_max_clock();
 | 
					 | 
				
			||||||
  void enable_peer_access(CUdeviceptr peer_mem_ptr) const;
 | 
					 | 
				
			||||||
  // Target
 | 
					 | 
				
			||||||
  std::unique_ptr<codegen::target> make_target() const;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  std::shared_ptr<int> interpreted_as_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -10,6 +10,10 @@
 | 
				
			|||||||
#include "triton/external/CUDA/cuda.h"
 | 
					#include "triton/external/CUDA/cuda.h"
 | 
				
			||||||
#include "triton/external/CUDA/nvml.h"
 | 
					#include "triton/external/CUDA/nvml.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//// HIP backend
 | 
				
			||||||
 | 
					//#define __HIP_PLATFORM_AMD__
 | 
				
			||||||
 | 
					#include "triton/external/hip.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//Exceptions
 | 
					//Exceptions
 | 
				
			||||||
#include <iostream>
 | 
					#include <iostream>
 | 
				
			||||||
#include <stdexcept>
 | 
					#include <stdexcept>
 | 
				
			||||||
@@ -28,6 +32,7 @@ class cu_context;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
template<class T> void check(T){}
 | 
					template<class T> void check(T){}
 | 
				
			||||||
void check(CUresult err);
 | 
					void check(CUresult err);
 | 
				
			||||||
 | 
					void check(hipError_t err);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class dispatch
 | 
					class dispatch
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -58,17 +63,18 @@ protected:
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
 | 
					  static void release();
 | 
				
			||||||
 | 
					  // Nvidia
 | 
				
			||||||
  static bool nvmlinit();
 | 
					  static bool nvmlinit();
 | 
				
			||||||
  static bool cuinit();
 | 
					  static bool cuinit();
 | 
				
			||||||
  static void release();
 | 
					  // AMD
 | 
				
			||||||
 | 
					  static bool hipinit();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /* ------------------- *
 | 
					  /* ------------------- *
 | 
				
			||||||
   * CUDA
 | 
					   * CUDA
 | 
				
			||||||
   * ------------------- */
 | 
					   * ------------------- */
 | 
				
			||||||
  // context management
 | 
					  // context management
 | 
				
			||||||
  static CUresult cuInit(unsigned int Flags);
 | 
					  static CUresult cuInit(unsigned int Flags);
 | 
				
			||||||
  static CUresult cuCtxGetCurrent(CUcontext *pctx);
 | 
					 | 
				
			||||||
  static CUresult cuCtxSetCurrent(CUcontext ctx);
 | 
					 | 
				
			||||||
  static CUresult cuCtxDestroy_v2(CUcontext ctx);
 | 
					  static CUresult cuCtxDestroy_v2(CUcontext ctx);
 | 
				
			||||||
  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
 | 
					  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
 | 
				
			||||||
  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
 | 
					  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
 | 
				
			||||||
@@ -128,6 +134,55 @@ public:
 | 
				
			|||||||
  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
 | 
					  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
 | 
				
			||||||
  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
 | 
					  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* ------------------- *
 | 
				
			||||||
 | 
					   * HIP
 | 
				
			||||||
 | 
					   * ------------------- */
 | 
				
			||||||
 | 
					  // context management
 | 
				
			||||||
 | 
					  static hipError_t hipInit(unsigned int Flags);
 | 
				
			||||||
 | 
					  static hipError_t hipCtxDestroy(hipCtx_t ctx);
 | 
				
			||||||
 | 
					  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
 | 
				
			||||||
 | 
					  static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
 | 
				
			||||||
 | 
					  static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
 | 
				
			||||||
 | 
					  static hipError_t hipCtxGetDevice(hipDevice_t* result);
 | 
				
			||||||
 | 
					  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int flags);
 | 
				
			||||||
 | 
					  static hipError_t hipDriverGetVersion(int *driverVersion);
 | 
				
			||||||
 | 
					  // device management
 | 
				
			||||||
 | 
					  static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
 | 
				
			||||||
 | 
					  static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
 | 
				
			||||||
 | 
					  static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
 | 
				
			||||||
 | 
					  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
 | 
				
			||||||
 | 
					  static hipError_t hipGetDeviceCount(int *count);
 | 
				
			||||||
 | 
					  // module management
 | 
				
			||||||
 | 
					  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t* bytes, hipModule_t hmod, const char *name);
 | 
				
			||||||
 | 
					  static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 | 
				
			||||||
 | 
					  static hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
 | 
				
			||||||
 | 
					  static hipError_t hipModuleUnload(hipModule_t hmod);
 | 
				
			||||||
 | 
					  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
 | 
				
			||||||
 | 
					  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name);
 | 
				
			||||||
 | 
					  // stream management
 | 
				
			||||||
 | 
					  static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
 | 
				
			||||||
 | 
					  static hipError_t hipStreamSynchronize(hipStream_t hStream);
 | 
				
			||||||
 | 
					  static hipError_t hipStreamDestroy(hipStream_t hStream);
 | 
				
			||||||
 | 
					  static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
 | 
				
			||||||
 | 
					  // function management
 | 
				
			||||||
 | 
					  static hipError_t hipFuncGetAttributes(hipFuncAttributes* attrib, void* hfunc);
 | 
				
			||||||
 | 
					  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, hipFuncAttribute attrib, int value);
 | 
				
			||||||
 | 
					  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
 | 
				
			||||||
 | 
					  // memory management
 | 
				
			||||||
 | 
					  static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
 | 
				
			||||||
 | 
					  static hipError_t hipPointerGetAttribute(void * data, CUpointer_attribute attribute, hipDeviceptr_t ptr);
 | 
				
			||||||
 | 
					  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, size_t N, hipStream_t stream);
 | 
				
			||||||
 | 
					  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
 | 
				
			||||||
 | 
					  static hipError_t hipFree(hipDeviceptr_t dptr);
 | 
				
			||||||
 | 
					  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
 | 
				
			||||||
 | 
					  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
 | 
				
			||||||
 | 
					  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
 | 
				
			||||||
 | 
					  // event management
 | 
				
			||||||
 | 
					  static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
 | 
				
			||||||
 | 
					  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
 | 
				
			||||||
 | 
					  static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
 | 
				
			||||||
 | 
					  static hipError_t hipEventDestroy(hipEvent_t hEvent);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
private:
 | 
					private:
 | 
				
			||||||
@@ -135,6 +190,7 @@ private:
 | 
				
			|||||||
  // Libraries
 | 
					  // Libraries
 | 
				
			||||||
  static void* cuda_;
 | 
					  static void* cuda_;
 | 
				
			||||||
  static void* nvml_;
 | 
					  static void* nvml_;
 | 
				
			||||||
 | 
					  static void* hip_;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /* ------------------- *
 | 
					  /* ------------------- *
 | 
				
			||||||
@@ -194,9 +250,6 @@ private:
 | 
				
			|||||||
  static void* cuEventRecord_;
 | 
					  static void* cuEventRecord_;
 | 
				
			||||||
  static void* cuEventDestroy_v2_;
 | 
					  static void* cuEventDestroy_v2_;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  /* ------------------- *
 | 
					  /* ------------------- *
 | 
				
			||||||
   * NVML
 | 
					   * NVML
 | 
				
			||||||
   * ------------------- */
 | 
					   * ------------------- */
 | 
				
			||||||
@@ -205,6 +258,55 @@ private:
 | 
				
			|||||||
  static void* nvmlDeviceGetClockInfo_;
 | 
					  static void* nvmlDeviceGetClockInfo_;
 | 
				
			||||||
  static void* nvmlDeviceGetMaxClockInfo_;
 | 
					  static void* nvmlDeviceGetMaxClockInfo_;
 | 
				
			||||||
  static void* nvmlDeviceSetApplicationsClocks_;
 | 
					  static void* nvmlDeviceSetApplicationsClocks_;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /* ------------------- *
 | 
				
			||||||
 | 
					   * HIP
 | 
				
			||||||
 | 
					   * ------------------- */
 | 
				
			||||||
 | 
					  // context management
 | 
				
			||||||
 | 
					  static void* hipInit_;
 | 
				
			||||||
 | 
					  static void* hipCtxDestroy_;
 | 
				
			||||||
 | 
					  static void* hipCtxCreate_;
 | 
				
			||||||
 | 
					  static void* hipCtxPushCurrent_;
 | 
				
			||||||
 | 
					  static void* hipCtxPopCurrent_;
 | 
				
			||||||
 | 
					  static void* hipCtxGetDevice_;
 | 
				
			||||||
 | 
					  static void* hipCtxEnablePeerAccess_;
 | 
				
			||||||
 | 
					  static void* hipDriverGetVersion_;
 | 
				
			||||||
 | 
					  // device management
 | 
				
			||||||
 | 
					  static void* hipGetDevice_;
 | 
				
			||||||
 | 
					  static void* hipDeviceGetName_;
 | 
				
			||||||
 | 
					  static void* hipDeviceGetPCIBusId_;
 | 
				
			||||||
 | 
					  static void* hipDeviceGetAttribute_;
 | 
				
			||||||
 | 
					  static void* hipGetDeviceCount_;
 | 
				
			||||||
 | 
					  // module management
 | 
				
			||||||
 | 
					  static void* hipModuleGetGlobal_;
 | 
				
			||||||
 | 
					  static void* hipModuleLoad_;
 | 
				
			||||||
 | 
					  static void* hipModuleLoadData_;
 | 
				
			||||||
 | 
					  static void* hipModuleUnload_;
 | 
				
			||||||
 | 
					  static void* hipModuleLoadDataEx_;
 | 
				
			||||||
 | 
					  static void* hipModuleGetFunction_;
 | 
				
			||||||
 | 
					  // stream management
 | 
				
			||||||
 | 
					  static void* hipStreamCreate_;
 | 
				
			||||||
 | 
					  static void* hipStreamSynchronize_;
 | 
				
			||||||
 | 
					  static void* hipStreamDestroy_;
 | 
				
			||||||
 | 
					  static void* hipModuleLaunchKernel_;  // NOTE(review): presumably the cached address of the hipModuleLaunchKernel symbol — confirm in dispatch.cc
 | 
				
			||||||
 | 
					  // function management
 | 
				
			||||||
 | 
					  static void* hipFuncGetAttributes_;
 | 
				
			||||||
 | 
					  static void* hipFuncSetAttribute_;
 | 
				
			||||||
 | 
					  static void* hipFuncSetCacheConfig_;
 | 
				
			||||||
 | 
					  // memory management
 | 
				
			||||||
 | 
					  static void* hipMalloc_;
 | 
				
			||||||
 | 
					  static void* hipPointerGetAttribute_;
 | 
				
			||||||
 | 
					  static void* hipMemsetD8Async_;
 | 
				
			||||||
 | 
					  static void* hipMemcpyDtoH_;
 | 
				
			||||||
 | 
					  static void* hipFree_;
 | 
				
			||||||
 | 
					  static void* hipMemcpyDtoHAsync_;
 | 
				
			||||||
 | 
					  static void* hipMemcpyHtoDAsync_;
 | 
				
			||||||
 | 
					  static void* hipMemcpyHtoD_;
 | 
				
			||||||
 | 
					  // event management
 | 
				
			||||||
 | 
					  static void* hipEventCreate_;
 | 
				
			||||||
 | 
					  static void* hipEventElapsedTime_;
 | 
				
			||||||
 | 
					  static void* hipEventRecord_;
 | 
				
			||||||
 | 
					  static void* hipEventDestroy_;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -141,6 +141,78 @@ namespace triton
 | 
				
			|||||||
  TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow          ,"runtime fp overflow");
 | 
					  TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow          ,"runtime fp overflow");
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  namespace hip
					  {
					  // Common base of every HIP error type declared below, so callers can
					  // catch any of them with a single `catch(const hip::base&)`.
					  class base: public std::exception{};

					// Declares exception class `name`, derived from `base`, whose what()
					// returns the static string "HIP: Error- " followed by `msg`.
					// `msg` must be a string literal (it is concatenated at compile time).
					#define TRITON_CREATE_HIP_EXCEPTION(name, msg) class name: public base { public: const char * what() const noexcept override { return "HIP: Error- " msg; } }

					  TRITON_CREATE_HIP_EXCEPTION(invalid_value                   ,"invalid value");
					  TRITON_CREATE_HIP_EXCEPTION(out_of_memory                   ,"out of memory");
					  TRITON_CREATE_HIP_EXCEPTION(not_initialized                 ,"not initialized");
					  TRITON_CREATE_HIP_EXCEPTION(deinitialized                   ,"deinitialized");
					  TRITON_CREATE_HIP_EXCEPTION(profiler_disabled               ,"profiler disabled");
					  TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
					  TRITON_CREATE_HIP_EXCEPTION(profiler_already_started        ,"profiler already started");
					  TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
					  TRITON_CREATE_HIP_EXCEPTION(no_device                       ,"no device");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_device                  ,"invalid device");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_image                   ,"invalid image");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_context                 ,"invalid context");
					  TRITON_CREATE_HIP_EXCEPTION(context_already_current         ,"context already current");
					  TRITON_CREATE_HIP_EXCEPTION(map_failed                      ,"map failed");
					  TRITON_CREATE_HIP_EXCEPTION(unmap_failed                    ,"unmap failed");
					  TRITON_CREATE_HIP_EXCEPTION(array_is_mapped                 ,"array is mapped");
					  TRITON_CREATE_HIP_EXCEPTION(already_mapped                  ,"already mapped");
					  TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
					  TRITON_CREATE_HIP_EXCEPTION(already_acquired                ,"already acquired");
					  TRITON_CREATE_HIP_EXCEPTION(not_mapped                      ,"not mapped");
					  TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
					  TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
					  TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
					  TRITON_CREATE_HIP_EXCEPTION(unsupported_limit               ,"unsupported limit");
					  TRITON_CREATE_HIP_EXCEPTION(context_already_in_use          ,"context already in use");
					  TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_ptx                     ,"invalid ptx");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_source                  ,"invalid source");
					  TRITON_CREATE_HIP_EXCEPTION(file_not_found                  ,"file not found");
					  TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
					  TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
					  TRITON_CREATE_HIP_EXCEPTION(operating_system                ,"operating system");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_handle                  ,"invalid handle");
					  TRITON_CREATE_HIP_EXCEPTION(not_found                       ,"not found");
					  TRITON_CREATE_HIP_EXCEPTION(not_ready                       ,"not ready");
					  TRITON_CREATE_HIP_EXCEPTION(illegal_address                 ,"illegal address");
					  TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
					  TRITON_CREATE_HIP_EXCEPTION(launch_timeout                  ,"launch timeout");
					  TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
					  TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
					  TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
					  TRITON_CREATE_HIP_EXCEPTION(primary_context_active          ,"primary context active");
					  TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed            ,"context is destroyed");
					  TRITON_CREATE_HIP_EXCEPTION(assert_error                    ,"assert");
					  TRITON_CREATE_HIP_EXCEPTION(too_many_peers                  ,"too many peers");
					  TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
					  // Message previously read "hot memory not registered" (typo).
					  TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered      ,"host memory not registered");
					  TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error            ,"hardware stack error");
					  TRITON_CREATE_HIP_EXCEPTION(illegal_instruction             ,"illegal instruction");
					  TRITON_CREATE_HIP_EXCEPTION(misaligned_address              ,"misaligned address");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_address_space           ,"invalid address space");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_pc                      ,"invalid pc");
					  TRITON_CREATE_HIP_EXCEPTION(launch_failed                   ,"launch failed");
					  TRITON_CREATE_HIP_EXCEPTION(not_permitted                   ,"not permitted");
					  TRITON_CREATE_HIP_EXCEPTION(not_supported                   ,"not supported");
					  TRITON_CREATE_HIP_EXCEPTION(invalid_symbol                  ,"invalid symbol");
					  TRITON_CREATE_HIP_EXCEPTION(unknown                         ,"unknown");

					// Undefine the helper that this namespace actually defined, so it does
					// not leak into every translation unit that includes this header.
					#undef TRITON_CREATE_HIP_EXCEPTION
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,146 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_HANDLE_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_HANDLE_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <memory>
 | 
					 | 
				
			||||||
#include <map>
 | 
					 | 
				
			||||||
#include <iostream>
 | 
					 | 
				
			||||||
#include <functional>
 | 
					 | 
				
			||||||
#include <type_traits>
 | 
					 | 
				
			||||||
#include "triton/driver/dispatch.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/JITSymbol.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/Orc/Core.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 | 
					 | 
				
			||||||
#include "triton/tools/thread_pool.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace llvm
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
class ExecutionEngine;
 | 
					 | 
				
			||||||
class Function;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
enum backend_t {
 | 
					 | 
				
			||||||
  CUDA,
 | 
					 | 
				
			||||||
  Host
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host handles
 | 
					 | 
				
			||||||
struct host_platform_t{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct host_device_t{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct host_context_t{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct host_stream_t{
 | 
					 | 
				
			||||||
  std::shared_ptr<ThreadPool> pool;
 | 
					 | 
				
			||||||
  std::shared_ptr<std::vector<std::future<void>>> futures;
 | 
					 | 
				
			||||||
  std::vector<std::shared_ptr<char*>> args;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct host_module_t{
 | 
					 | 
				
			||||||
  std::string error;
 | 
					 | 
				
			||||||
  llvm::ExecutionEngine* engine;
 | 
					 | 
				
			||||||
  std::map<std::string, llvm::Function*> functions;
 | 
					 | 
				
			||||||
  void(*fn)(char**, int32_t, int32_t, int32_t);
 | 
					 | 
				
			||||||
  llvm::orc::ExecutionSession* ES;
 | 
					 | 
				
			||||||
  llvm::orc::RTDyldObjectLinkingLayer* ObjectLayer;
 | 
					 | 
				
			||||||
  llvm::orc::IRCompileLayer* CompileLayer;
 | 
					 | 
				
			||||||
  llvm::DataLayout* DL;
 | 
					 | 
				
			||||||
  llvm::orc::MangleAndInterner* Mangle;
 | 
					 | 
				
			||||||
  llvm::orc::ThreadSafeContext* Ctx;
 | 
					 | 
				
			||||||
  llvm::orc::JITDylib *MainJD;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct host_function_t{
 | 
					 | 
				
			||||||
  llvm::Function* fn;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct host_buffer_t{
 | 
					 | 
				
			||||||
  char* data;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Extra CUDA handles
 | 
					 | 
				
			||||||
struct cu_event_t{
 | 
					 | 
				
			||||||
  operator bool() const { return first && second; }
 | 
					 | 
				
			||||||
  CUevent first;
 | 
					 | 
				
			||||||
  CUevent second;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct CUPlatform{
 | 
					 | 
				
			||||||
  CUPlatform() : status_(dispatch::cuInit(0)) { }
 | 
					 | 
				
			||||||
  operator bool() const { return status_; }
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  CUresult status_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template<class T, class CUType>
 | 
					 | 
				
			||||||
class handle_interface{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
    //Accessors
 | 
					 | 
				
			||||||
    operator CUType() const { return *(((T*)this)->cu().h_); }
 | 
					 | 
				
			||||||
    //Comparison
 | 
					 | 
				
			||||||
    bool operator==(handle_interface const & y) { return (CUType)(*this) == (CUType)(y); }
 | 
					 | 
				
			||||||
    bool operator!=(handle_interface const & y) { return (CUType)(*this) != (CUType)(y); }
 | 
					 | 
				
			||||||
    bool operator<(handle_interface const & y) { return (CUType)(*this) < (CUType)(y); }
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template<class T>
 | 
					 | 
				
			||||||
class handle{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  template<class, class> friend class handle_interface;
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  //Constructors
 | 
					 | 
				
			||||||
  handle(T h, bool take_ownership = true);
 | 
					 | 
				
			||||||
  handle();
 | 
					 | 
				
			||||||
  ~handle();
 | 
					 | 
				
			||||||
  T& operator*() { return *h_; }
 | 
					 | 
				
			||||||
  T const & operator*() const { return *h_; }
 | 
					 | 
				
			||||||
  T* operator->() const { return h_.get(); }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  std::shared_ptr<T> h_;
 | 
					 | 
				
			||||||
  bool has_ownership_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template<class CUType, class HostType>
 | 
					 | 
				
			||||||
class polymorphic_resource {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){}
 | 
					 | 
				
			||||||
  polymorphic_resource(HostType hst, bool take_ownership): hst_(hst, take_ownership), backend_(Host){}
 | 
					 | 
				
			||||||
  virtual ~polymorphic_resource() { }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  handle<CUType> cu() { return cu_; }
 | 
					 | 
				
			||||||
  handle<HostType> hst() { return hst_; }
 | 
					 | 
				
			||||||
  const handle<CUType>& cu() const { return cu_; }
 | 
					 | 
				
			||||||
  const handle<HostType>& hst() const { return hst_; }
 | 
					 | 
				
			||||||
  backend_t backend() { return backend_; }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  handle<CUType> cu_;
 | 
					 | 
				
			||||||
  handle<HostType> hst_;
 | 
					 | 
				
			||||||
  backend_t backend_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -1,53 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_KERNEL_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_KERNEL_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/module.h"
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
#include <memory>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace llvm
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
class GenericValue;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class cu_buffer;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Base
 | 
					 | 
				
			||||||
class kernel: public polymorphic_resource<CUfunction, host_function_t> {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  kernel(driver::module* program, CUfunction fn, bool has_ownership);
 | 
					 | 
				
			||||||
  kernel(driver::module* program, host_function_t fn, bool has_ownership);
 | 
					 | 
				
			||||||
  driver::module* module();
 | 
					 | 
				
			||||||
  static kernel* create(driver::module* program, const char* name);
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  driver::module* program_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host
 | 
					 | 
				
			||||||
class host_kernel: public kernel {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  //Constructors
 | 
					 | 
				
			||||||
  host_kernel(driver::module* program, const char* name);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA
 | 
					 | 
				
			||||||
class cu_kernel: public kernel {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  //Constructors
 | 
					 | 
				
			||||||
  cu_kernel(driver::module* program, const char * name);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										18
									
								
								include/triton/driver/llvm.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								include/triton/driver/llvm.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
				
			|||||||
 | 
					#pragma once

					#ifndef _TRITON_DRIVER_LLVM_H_
					#define _TRITON_DRIVER_LLVM_H_

					#include <string>
					#include "triton/driver/dispatch.h"

					// Forward declaration only: this header passes llvm::Module by pointer
					// and therefore does not need the full LLVM definition.
					namespace llvm{
					class Module;
					}

					namespace triton{
					namespace driver{

					// Free functions for lowering LLVM IR to device code.
					// NOTE(review): the semantic notes below are inferred from the names and
					// signatures only; confirm against the implementation file.

					// Presumably performs one-time LLVM target initialisation — TODO confirm.
					void init_llvm();

					// Returns a string produced from `module`; by its name this is PTX text.
					// `cc` is presumably the CUDA compute capability and `version` the PTX
					// version to target — TODO confirm.
					std::string llir_to_ptx(llvm::Module* module, int cc, int version);

					// Builds a CUmodule from the PTX text `ptx` for the given `cc`.
					CUmodule ptx_to_cumodule(const std::string& ptx, int cc);

					// Returns a string produced from `module` for the AMDGPU processor named
					// by `proc`. NOTE(review): the result is probably a file path rather than
					// code, since amdgpu_to_hipmodule() below takes a `path` — confirm.
					std::string llir_to_amdgpu(llvm::Module* module, const std::string& proc);

					// Builds a hipModule_t from the file at `path`.
					hipModule_t amdgpu_to_hipmodule(const std::string& path);

					}
					}

					#endif
 | 
				
			||||||
@@ -1,84 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_MODULE_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_MODULE_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <map>
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/buffer.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace llvm
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  class Module;
 | 
					 | 
				
			||||||
  template<class T>
 | 
					 | 
				
			||||||
  class SmallVectorImpl;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class cu_context;
 | 
					 | 
				
			||||||
class cu_device;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Base
 | 
					 | 
				
			||||||
class module: public polymorphic_resource<CUmodule, host_module_t> {
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  void init_llvm();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  enum file_type_t{
 | 
					 | 
				
			||||||
    Object,
 | 
					 | 
				
			||||||
    Assembly
 | 
					 | 
				
			||||||
  };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  module(CUmodule mod, bool has_ownership);
 | 
					 | 
				
			||||||
  module(host_module_t mod, bool has_ownership);
 | 
					 | 
				
			||||||
  static module* create(driver::device* device, std::unique_ptr<llvm::Module> src);
 | 
					 | 
				
			||||||
  void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
 | 
					 | 
				
			||||||
                           const std::string &proc, std::string layout,
 | 
					 | 
				
			||||||
                           llvm::SmallVectorImpl<char> &buffer,
 | 
					 | 
				
			||||||
                           const std::string &features,
 | 
					 | 
				
			||||||
                           file_type_t file_type);
 | 
					 | 
				
			||||||
  virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;
 | 
					 | 
				
			||||||
  int spilled() const { return spilled_; }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
protected:
 | 
					 | 
				
			||||||
  int spilled_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CPU
 | 
					 | 
				
			||||||
class host_module: public module{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  host_module(std::unique_ptr<llvm::Module> module);
 | 
					 | 
				
			||||||
  std::unique_ptr<buffer> symbol(const char * name) const;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA
 | 
					 | 
				
			||||||
class cu_module: public module {
 | 
					 | 
				
			||||||
  std::string compile_llvm_module(llvm::Module* module, driver::device* device);
 | 
					 | 
				
			||||||
  void init_from_ptx(const std::string& ptx, cu_device *device);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  cu_module(driver::device* device, std::unique_ptr<llvm::Module> module);
 | 
					 | 
				
			||||||
  cu_module(driver::device* device, const std::string& source);
 | 
					 | 
				
			||||||
  std::unique_ptr<buffer> symbol(const char * name) const;
 | 
					 | 
				
			||||||
  std::string llir() const { return llir_; }
 | 
					 | 
				
			||||||
  const std::string& ptx() const { return ptx_; }
 | 
					 | 
				
			||||||
  const std::string& cubin() const { return cubin_; }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  std::string ptx_;
 | 
					 | 
				
			||||||
  std::string cubin_;
 | 
					 | 
				
			||||||
  std::string llir_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -1,58 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_PLATFORM_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_PLATFORM_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <vector>
 | 
					 | 
				
			||||||
#include <string>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class device;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class platform
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  // Constructor
 | 
					 | 
				
			||||||
  platform(const std::string& name): name_(name){ }
 | 
					 | 
				
			||||||
  // Accessors
 | 
					 | 
				
			||||||
  std::string name() const { return name_; }
 | 
					 | 
				
			||||||
  // Virtual methods
 | 
					 | 
				
			||||||
  virtual std::string version() const = 0;
 | 
					 | 
				
			||||||
  virtual void devices(std::vector<driver::device *> &devices) const = 0;
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  std::string name_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA
 | 
					 | 
				
			||||||
class cu_platform: public platform
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  cu_platform(): platform("CUDA") { }
 | 
					 | 
				
			||||||
  std::string version() const;
 | 
					 | 
				
			||||||
  void devices(std::vector<driver::device*> &devices) const;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
private:
 | 
					 | 
				
			||||||
  handle<CUPlatform> cu_;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host
 | 
					 | 
				
			||||||
class host_platform: public platform
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  host_platform(): platform("CPU") { }
 | 
					 | 
				
			||||||
  std::string version() const;
 | 
					 | 
				
			||||||
  void devices(std::vector<driver::device*> &devices) const;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
@@ -1,68 +0,0 @@
 | 
				
			|||||||
#pragma once
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef _TRITON_DRIVER_STREAM_H_
 | 
					 | 
				
			||||||
#define _TRITON_DRIVER_STREAM_H_
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <map>
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/device.h"
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
#include "triton/driver/buffer.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class kernel;
 | 
					 | 
				
			||||||
class event;
 | 
					 | 
				
			||||||
class Range;
 | 
					 | 
				
			||||||
class cu_buffer;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Base
 | 
					 | 
				
			||||||
class stream: public polymorphic_resource<CUstream, host_stream_t> {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  stream(CUstream, bool has_ownership);
 | 
					 | 
				
			||||||
  stream(host_stream_t, bool has_ownership);
 | 
					 | 
				
			||||||
  // factory
 | 
					 | 
				
			||||||
  static driver::stream* create(backend_t backend);
 | 
					 | 
				
			||||||
  // methods
 | 
					 | 
				
			||||||
  virtual void synchronize() = 0;
 | 
					 | 
				
			||||||
  virtual void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem = 0) = 0;
 | 
					 | 
				
			||||||
  virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0;
 | 
					 | 
				
			||||||
  virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0;
 | 
					 | 
				
			||||||
  // template helpers
 | 
					 | 
				
			||||||
  template<class T> void write(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T> const & x)
 | 
					 | 
				
			||||||
  { write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
 | 
					 | 
				
			||||||
  template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
 | 
					 | 
				
			||||||
  { read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host
 | 
					 | 
				
			||||||
class host_stream: public stream {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  host_stream();
 | 
					 | 
				
			||||||
  void synchronize();
 | 
					 | 
				
			||||||
  void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
 | 
					 | 
				
			||||||
  void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
 | 
					 | 
				
			||||||
  void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA
 | 
					 | 
				
			||||||
class cu_stream: public stream {
 | 
					 | 
				
			||||||
public:
 | 
					 | 
				
			||||||
  cu_stream(CUstream str, bool take_ownership);
 | 
					 | 
				
			||||||
  cu_stream();
 | 
					 | 
				
			||||||
  void synchronize();
 | 
					 | 
				
			||||||
  void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
 | 
					 | 
				
			||||||
  void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
 | 
					 | 
				
			||||||
  void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
							
								
								
									
										1468
									
								
								include/triton/external/CL/cl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1468
									
								
								include/triton/external/CL/cl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										12947
									
								
								include/triton/external/CL/cl.hpp
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										12947
									
								
								include/triton/external/CL/cl.hpp
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										9677
									
								
								include/triton/external/CL/cl2.hpp
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										9677
									
								
								include/triton/external/CL/cl2.hpp
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										131
									
								
								include/triton/external/CL/cl_d3d10.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										131
									
								
								include/triton/external/CL/cl_d3d10.h
									
									
									
									
										vendored
									
									
								
							@@ -1,131 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_D3D10_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_D3D10_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <d3d10.h>
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#include "cl_platform.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************
 | 
					 | 
				
			||||||
 * cl_khr_d3d10_sharing                                                       */
 | 
					 | 
				
			||||||
#define cl_khr_d3d10_sharing 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint cl_d3d10_device_source_khr;
 | 
					 | 
				
			||||||
typedef cl_uint cl_d3d10_device_set_khr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Error Codes */
 | 
					 | 
				
			||||||
#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
 | 
					 | 
				
			||||||
#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
 | 
					 | 
				
			||||||
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
 | 
					 | 
				
			||||||
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_d3d10_device_source_nv */
 | 
					 | 
				
			||||||
#define CL_D3D10_DEVICE_KHR                          0x4010
 | 
					 | 
				
			||||||
#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_d3d10_device_set_nv */
 | 
					 | 
				
			||||||
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
 | 
					 | 
				
			||||||
#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_context_info */
 | 
					 | 
				
			||||||
#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
 | 
					 | 
				
			||||||
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_mem_info */
 | 
					 | 
				
			||||||
#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_image_info */
 | 
					 | 
				
			||||||
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_type */
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
 | 
					 | 
				
			||||||
    cl_platform_id             platform,
 | 
					 | 
				
			||||||
    cl_d3d10_device_source_khr d3d_device_source,
 | 
					 | 
				
			||||||
    void *                     d3d_object,
 | 
					 | 
				
			||||||
    cl_d3d10_device_set_khr    d3d_device_set,
 | 
					 | 
				
			||||||
    cl_uint                    num_entries,
 | 
					 | 
				
			||||||
    cl_device_id *             devices,
 | 
					 | 
				
			||||||
    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context     context,
 | 
					 | 
				
			||||||
    cl_mem_flags   flags,
 | 
					 | 
				
			||||||
    ID3D10Buffer * resource,
 | 
					 | 
				
			||||||
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context        context,
 | 
					 | 
				
			||||||
    cl_mem_flags      flags,
 | 
					 | 
				
			||||||
    ID3D10Texture2D * resource,
 | 
					 | 
				
			||||||
    UINT              subresource,
 | 
					 | 
				
			||||||
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context        context,
 | 
					 | 
				
			||||||
    cl_mem_flags      flags,
 | 
					 | 
				
			||||||
    ID3D10Texture3D * resource,
 | 
					 | 
				
			||||||
    UINT              subresource,
 | 
					 | 
				
			||||||
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue command_queue,
 | 
					 | 
				
			||||||
    cl_uint          num_objects,
 | 
					 | 
				
			||||||
    const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
    cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
    const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue command_queue,
 | 
					 | 
				
			||||||
    cl_uint          num_objects,
 | 
					 | 
				
			||||||
    const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
    cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
    const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_CL_D3D10_H */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										131
									
								
								include/triton/external/CL/cl_d3d11.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										131
									
								
								include/triton/external/CL/cl_d3d11.h
									
									
									
									
										vendored
									
									
								
							@@ -1,131 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_D3D11_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_D3D11_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <d3d11.h>
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#include "cl_platform.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************
 | 
					 | 
				
			||||||
 * cl_khr_d3d11_sharing                                                       */
 | 
					 | 
				
			||||||
#define cl_khr_d3d11_sharing 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint cl_d3d11_device_source_khr;
 | 
					 | 
				
			||||||
typedef cl_uint cl_d3d11_device_set_khr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Error Codes */
 | 
					 | 
				
			||||||
#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
 | 
					 | 
				
			||||||
#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
 | 
					 | 
				
			||||||
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
 | 
					 | 
				
			||||||
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_d3d11_device_source */
 | 
					 | 
				
			||||||
#define CL_D3D11_DEVICE_KHR                          0x4019
 | 
					 | 
				
			||||||
#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_d3d11_device_set */
 | 
					 | 
				
			||||||
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
 | 
					 | 
				
			||||||
#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_context_info */
 | 
					 | 
				
			||||||
#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
 | 
					 | 
				
			||||||
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_mem_info */
 | 
					 | 
				
			||||||
#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_image_info */
 | 
					 | 
				
			||||||
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_type */
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
 | 
					 | 
				
			||||||
    cl_platform_id             platform,
 | 
					 | 
				
			||||||
    cl_d3d11_device_source_khr d3d_device_source,
 | 
					 | 
				
			||||||
    void *                     d3d_object,
 | 
					 | 
				
			||||||
    cl_d3d11_device_set_khr    d3d_device_set,
 | 
					 | 
				
			||||||
    cl_uint                    num_entries,
 | 
					 | 
				
			||||||
    cl_device_id *             devices,
 | 
					 | 
				
			||||||
    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context     context,
 | 
					 | 
				
			||||||
    cl_mem_flags   flags,
 | 
					 | 
				
			||||||
    ID3D11Buffer * resource,
 | 
					 | 
				
			||||||
    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context        context,
 | 
					 | 
				
			||||||
    cl_mem_flags      flags,
 | 
					 | 
				
			||||||
    ID3D11Texture2D * resource,
 | 
					 | 
				
			||||||
    UINT              subresource,
 | 
					 | 
				
			||||||
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context        context,
 | 
					 | 
				
			||||||
    cl_mem_flags      flags,
 | 
					 | 
				
			||||||
    ID3D11Texture3D * resource,
 | 
					 | 
				
			||||||
    UINT              subresource,
 | 
					 | 
				
			||||||
    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue command_queue,
 | 
					 | 
				
			||||||
    cl_uint          num_objects,
 | 
					 | 
				
			||||||
    const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
    cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
    const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue command_queue,
 | 
					 | 
				
			||||||
    cl_uint          num_objects,
 | 
					 | 
				
			||||||
    const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
    cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
    const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_CL_D3D11_H */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										132
									
								
								include/triton/external/CL/cl_dx9_media_sharing.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										132
									
								
								include/triton/external/CL/cl_dx9_media_sharing.h
									
									
									
									
										vendored
									
									
								
							@@ -1,132 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#include "cl_platform.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
/* cl_khr_dx9_media_sharing                                                   */
 | 
					 | 
				
			||||||
#define cl_khr_dx9_media_sharing 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint             cl_dx9_media_adapter_type_khr;
 | 
					 | 
				
			||||||
typedef cl_uint             cl_dx9_media_adapter_set_khr;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
#if defined(_WIN32)
 | 
					 | 
				
			||||||
#include <d3d9.h>
 | 
					 | 
				
			||||||
typedef struct _cl_dx9_surface_info_khr
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    IDirect3DSurface9 *resource;
 | 
					 | 
				
			||||||
    HANDLE shared_handle;
 | 
					 | 
				
			||||||
} cl_dx9_surface_info_khr;
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Error Codes */
 | 
					 | 
				
			||||||
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
 | 
					 | 
				
			||||||
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
 | 
					 | 
				
			||||||
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
 | 
					 | 
				
			||||||
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_media_adapter_type_khr */
 | 
					 | 
				
			||||||
#define CL_ADAPTER_D3D9_KHR                              0x2020
 | 
					 | 
				
			||||||
#define CL_ADAPTER_D3D9EX_KHR                            0x2021
 | 
					 | 
				
			||||||
#define CL_ADAPTER_DXVA_KHR                              0x2022
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_media_adapter_set_khr */
 | 
					 | 
				
			||||||
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
 | 
					 | 
				
			||||||
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_context_info */
 | 
					 | 
				
			||||||
#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
 | 
					 | 
				
			||||||
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
 | 
					 | 
				
			||||||
#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_mem_info */
 | 
					 | 
				
			||||||
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
 | 
					 | 
				
			||||||
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_image_info */
 | 
					 | 
				
			||||||
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_type */
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
 | 
					 | 
				
			||||||
    cl_platform_id                   platform,
 | 
					 | 
				
			||||||
    cl_uint                          num_media_adapters,
 | 
					 | 
				
			||||||
    cl_dx9_media_adapter_type_khr *  media_adapter_type,
 | 
					 | 
				
			||||||
    void *                           media_adapters,
 | 
					 | 
				
			||||||
    cl_dx9_media_adapter_set_khr     media_adapter_set,
 | 
					 | 
				
			||||||
    cl_uint                          num_entries,
 | 
					 | 
				
			||||||
    cl_device_id *                   devices,
 | 
					 | 
				
			||||||
    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
 | 
					 | 
				
			||||||
    cl_context                    context,
 | 
					 | 
				
			||||||
    cl_mem_flags                  flags,
 | 
					 | 
				
			||||||
    cl_dx9_media_adapter_type_khr adapter_type,
 | 
					 | 
				
			||||||
    void *                        surface_info,
 | 
					 | 
				
			||||||
    cl_uint                       plane,                                                                          
 | 
					 | 
				
			||||||
    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue command_queue,
 | 
					 | 
				
			||||||
    cl_uint          num_objects,
 | 
					 | 
				
			||||||
    const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
    cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
    const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue command_queue,
 | 
					 | 
				
			||||||
    cl_uint          num_objects,
 | 
					 | 
				
			||||||
    const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
    cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
    const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@@ -1,182 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2016 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
/*****************************************************************************\
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
					 | 
				
			||||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
					 | 
				
			||||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
					 | 
				
			||||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 | 
					 | 
				
			||||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 | 
					 | 
				
			||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 | 
					 | 
				
			||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 | 
					 | 
				
			||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 | 
					 | 
				
			||||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 | 
					 | 
				
			||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
 | 
					 | 
				
			||||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
File Name: cl_dx9_media_sharing_intel.h
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Abstract:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Notes:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
\*****************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <CL/cl.h>
 | 
					 | 
				
			||||||
#include <CL/cl_platform.h>
 | 
					 | 
				
			||||||
#include <d3d9.h>
 | 
					 | 
				
			||||||
#include <dxvahd.h>
 | 
					 | 
				
			||||||
#include <wtypes.h>
 | 
					 | 
				
			||||||
#include <d3d9types.h>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/***************************************
 | 
					 | 
				
			||||||
* cl_intel_dx9_media_sharing extension *
 | 
					 | 
				
			||||||
****************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define cl_intel_dx9_media_sharing 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint cl_dx9_device_source_intel;
 | 
					 | 
				
			||||||
typedef cl_uint cl_dx9_device_set_intel;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* error codes */
 | 
					 | 
				
			||||||
#define CL_INVALID_DX9_DEVICE_INTEL                   -1010
 | 
					 | 
				
			||||||
#define CL_INVALID_DX9_RESOURCE_INTEL                 -1011
 | 
					 | 
				
			||||||
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL        -1012
 | 
					 | 
				
			||||||
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL            -1013
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_dx9_device_source_intel */
 | 
					 | 
				
			||||||
#define CL_D3D9_DEVICE_INTEL                          0x4022
 | 
					 | 
				
			||||||
#define CL_D3D9EX_DEVICE_INTEL                        0x4070
 | 
					 | 
				
			||||||
#define CL_DXVA_DEVICE_INTEL                          0x4071
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_dx9_device_set_intel */
 | 
					 | 
				
			||||||
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL            0x4024
 | 
					 | 
				
			||||||
#define CL_ALL_DEVICES_FOR_DX9_INTEL                  0x4025
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_context_info */
 | 
					 | 
				
			||||||
#define CL_CONTEXT_D3D9_DEVICE_INTEL                  0x4026
 | 
					 | 
				
			||||||
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL                0x4072
 | 
					 | 
				
			||||||
#define CL_CONTEXT_DXVA_DEVICE_INTEL                  0x4073
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_mem_info */
 | 
					 | 
				
			||||||
#define CL_MEM_DX9_RESOURCE_INTEL                     0x4027
 | 
					 | 
				
			||||||
#define CL_MEM_DX9_SHARED_HANDLE_INTEL                0x4074
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_image_info */
 | 
					 | 
				
			||||||
#define CL_IMAGE_DX9_PLANE_INTEL                      0x4075
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_type */
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL          0x402A
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL          0x402B
 | 
					 | 
				
			||||||
/******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetDeviceIDsFromDX9INTEL(
 | 
					 | 
				
			||||||
    cl_platform_id              /* platform */,
 | 
					 | 
				
			||||||
    cl_dx9_device_source_intel  /* dx9_device_source */,
 | 
					 | 
				
			||||||
    void*                       /* dx9_object */,
 | 
					 | 
				
			||||||
    cl_dx9_device_set_intel     /* dx9_device_set */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_entries */, 
 | 
					 | 
				
			||||||
    cl_device_id*               /* devices */, 
 | 
					 | 
				
			||||||
    cl_uint*                    /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
 | 
					 | 
				
			||||||
    cl_platform_id              /* platform */,
 | 
					 | 
				
			||||||
    cl_dx9_device_source_intel  /* dx9_device_source */,
 | 
					 | 
				
			||||||
    void*                       /* dx9_object */,
 | 
					 | 
				
			||||||
    cl_dx9_device_set_intel     /* dx9_device_set */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_entries */, 
 | 
					 | 
				
			||||||
    cl_device_id*               /* devices */, 
 | 
					 | 
				
			||||||
    cl_uint*                    /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clCreateFromDX9MediaSurfaceINTEL(
 | 
					 | 
				
			||||||
    cl_context                  /* context */,
 | 
					 | 
				
			||||||
    cl_mem_flags                /* flags */,
 | 
					 | 
				
			||||||
    IDirect3DSurface9*          /* resource */,
 | 
					 | 
				
			||||||
    HANDLE                      /* sharedHandle */,
 | 
					 | 
				
			||||||
    UINT                        /* plane */,
 | 
					 | 
				
			||||||
    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_context                  /* context */,
 | 
					 | 
				
			||||||
    cl_mem_flags                /* flags */,
 | 
					 | 
				
			||||||
    IDirect3DSurface9*          /* resource */,
 | 
					 | 
				
			||||||
    HANDLE                      /* sharedHandle */,
 | 
					 | 
				
			||||||
    UINT                        /* plane */,
 | 
					 | 
				
			||||||
    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueAcquireDX9ObjectsINTEL(
 | 
					 | 
				
			||||||
    cl_command_queue            /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_objects */,
 | 
					 | 
				
			||||||
    const cl_mem*               /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*             /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue            /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_objects */,
 | 
					 | 
				
			||||||
    const cl_mem*               /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*             /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueReleaseDX9ObjectsINTEL(
 | 
					 | 
				
			||||||
    cl_command_queue            /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_objects */,
 | 
					 | 
				
			||||||
    cl_mem*                     /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*             /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue            /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_objects */,
 | 
					 | 
				
			||||||
    cl_mem*                     /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                     /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*             /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										136
									
								
								include/triton/external/CL/cl_egl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										136
									
								
								include/triton/external/CL/cl_egl.h
									
									
									
									
										vendored
									
									
								
							@@ -1,136 +0,0 @@
 | 
				
			|||||||
/*******************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 ******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_EGL_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_EGL_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __APPLE__
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#endif  
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
 | 
					 | 
				
			||||||
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Error type for clCreateFromEGLImageKHR */
 | 
					 | 
				
			||||||
#define CL_INVALID_EGL_OBJECT_KHR             -1093
 | 
					 | 
				
			||||||
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* CLeglImageKHR is an opaque handle to an EGLImage */
 | 
					 | 
				
			||||||
typedef void* CLeglImageKHR;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
 | 
					 | 
				
			||||||
typedef void* CLeglDisplayKHR;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
 | 
					 | 
				
			||||||
typedef void* CLeglSyncKHR;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* properties passed to clCreateFromEGLImageKHR */
 | 
					 | 
				
			||||||
typedef intptr_t cl_egl_image_properties_khr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define cl_khr_egl_image 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clCreateFromEGLImageKHR(cl_context                  /* context */,
 | 
					 | 
				
			||||||
                        CLeglDisplayKHR             /* egldisplay */,
 | 
					 | 
				
			||||||
                        CLeglImageKHR               /* eglimage */,
 | 
					 | 
				
			||||||
                        cl_mem_flags                /* flags */,
 | 
					 | 
				
			||||||
                        const cl_egl_image_properties_khr * /* properties */,
 | 
					 | 
				
			||||||
                        cl_int *                    /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
 | 
					 | 
				
			||||||
	cl_context                  context,
 | 
					 | 
				
			||||||
	CLeglDisplayKHR             egldisplay,
 | 
					 | 
				
			||||||
	CLeglImageKHR               eglimage,
 | 
					 | 
				
			||||||
	cl_mem_flags                flags,
 | 
					 | 
				
			||||||
	const cl_egl_image_properties_khr * properties,
 | 
					 | 
				
			||||||
	cl_int *                    errcode_ret);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
 | 
					 | 
				
			||||||
                              cl_uint          /* num_objects */,
 | 
					 | 
				
			||||||
                              const cl_mem *   /* mem_objects */,
 | 
					 | 
				
			||||||
                              cl_uint          /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                              const cl_event * /* event_wait_list */,
 | 
					 | 
				
			||||||
                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
 | 
					 | 
				
			||||||
	cl_command_queue command_queue,
 | 
					 | 
				
			||||||
	cl_uint          num_objects,
 | 
					 | 
				
			||||||
	const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
	cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
	const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
	cl_event *       event);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
 | 
					 | 
				
			||||||
                              cl_uint          /* num_objects */,
 | 
					 | 
				
			||||||
                              const cl_mem *   /* mem_objects */,
 | 
					 | 
				
			||||||
                              cl_uint          /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                              const cl_event * /* event_wait_list */,
 | 
					 | 
				
			||||||
                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
 | 
					 | 
				
			||||||
	cl_command_queue command_queue,
 | 
					 | 
				
			||||||
	cl_uint          num_objects,
 | 
					 | 
				
			||||||
	const cl_mem *   mem_objects,
 | 
					 | 
				
			||||||
	cl_uint          num_events_in_wait_list,
 | 
					 | 
				
			||||||
	const cl_event * event_wait_list,
 | 
					 | 
				
			||||||
	cl_event *       event);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define cl_khr_egl_event 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_event CL_API_CALL
 | 
					 | 
				
			||||||
clCreateEventFromEGLSyncKHR(cl_context      /* context */,
 | 
					 | 
				
			||||||
                            CLeglSyncKHR    /* sync */,
 | 
					 | 
				
			||||||
                            CLeglDisplayKHR /* display */,
 | 
					 | 
				
			||||||
                            cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
 | 
					 | 
				
			||||||
	cl_context      context,
 | 
					 | 
				
			||||||
	CLeglSyncKHR    sync,
 | 
					 | 
				
			||||||
	CLeglDisplayKHR display,
 | 
					 | 
				
			||||||
	cl_int *        errcode_ret);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* __OPENCL_CL_EGL_H */
 | 
					 | 
				
			||||||
							
								
								
									
										670
									
								
								include/triton/external/CL/cl_ext.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										670
									
								
								include/triton/external/CL/cl_ext.h
									
									
									
									
										vendored
									
									
								
							@@ -1,670 +0,0 @@
 | 
				
			|||||||
/*******************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 ******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_ext.h contains OpenCL extensions which don't have external */
 | 
					 | 
				
			||||||
/* (OpenGL, D3D) dependencies.                                   */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __CL_EXT_H
 | 
					 | 
				
			||||||
#define __CL_EXT_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __APPLE__
 | 
					 | 
				
			||||||
        #include <OpenCL/cl.h>
 | 
					 | 
				
			||||||
    #include <AvailabilityMacros.h>
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
        #include "cl.h"
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_khr_fp64 extension - no extension #define since it has no functions  */
 | 
					 | 
				
			||||||
#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_khr_fp16 extension - no extension #define since it has no functions  */
 | 
					 | 
				
			||||||
#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Memory object destruction
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Apple extension for managing externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Registers a user callback function that will be called when the memory object is deleted and its resources 
 | 
					 | 
				
			||||||
 * freed. Each call to clSetMemObjectDestructorAPPLE registers the specified user callback function on a callback 
 | 
					 | 
				
			||||||
 * stack associated with memobj. The registered user callback functions are called in the reverse order in 
 | 
					 | 
				
			||||||
 * which they were registered. The user callback functions are called and then the memory object is deleted 
 | 
					 | 
				
			||||||
 * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
 | 
					 | 
				
			||||||
 * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
 | 
					 | 
				
			||||||
 * the storage bits for the memory object, can be reused or freed.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The application may not call CL APIs with the cl_mem object passed to pfn_notify.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
 | 
					 | 
				
			||||||
 * before using.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
#define cl_APPLE_SetMemObjectDestructor 1
 | 
					 | 
				
			||||||
cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
 | 
					 | 
				
			||||||
                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
 | 
					 | 
				
			||||||
                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Context Logging Functions
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
 | 
					 | 
				
			||||||
 * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
 | 
					 | 
				
			||||||
 * before using.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * clLogMessagesToSystemLogAPPLE forwards all log messages to the Apple System Logger.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
#define cl_APPLE_ContextLoggingFunctions 1
 | 
					 | 
				
			||||||
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
 | 
					 | 
				
			||||||
                                            const void * /* private_info */, 
 | 
					 | 
				
			||||||
                                            size_t       /* cb */, 
 | 
					 | 
				
			||||||
                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* clLogMessagesToStdoutAPPLE sends all log messages to the file descriptor stdout */
 | 
					 | 
				
			||||||
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
 | 
					 | 
				
			||||||
                                          const void * /* private_info */, 
 | 
					 | 
				
			||||||
                                          size_t       /* cb */, 
 | 
					 | 
				
			||||||
                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* clLogMessagesToStderrAPPLE sends all log messages to the file descriptor stderr */
 | 
					 | 
				
			||||||
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
 | 
					 | 
				
			||||||
                                          const void * /* private_info */, 
 | 
					 | 
				
			||||||
                                          size_t       /* cb */, 
 | 
					 | 
				
			||||||
                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/************************ 
 | 
					 | 
				
			||||||
* cl_khr_icd extension *                                                  
 | 
					 | 
				
			||||||
************************/
 | 
					 | 
				
			||||||
#define cl_khr_icd 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_platform_info                                                        */
 | 
					 | 
				
			||||||
#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Additional Error Codes                                                  */
 | 
					 | 
				
			||||||
#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
 | 
					 | 
				
			||||||
                       cl_platform_id * /* platforms */,
 | 
					 | 
				
			||||||
                       cl_uint *        /* num_platforms */);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Function-pointer type with the same signature as clIcdGetPlatformIDsKHR. */
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint          /* num_entries */,
                                         cl_platform_id * /* platforms */,
                                         cl_uint *        /* num_platforms */);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Extension: cl_khr_image2D_buffer
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
 | 
					 | 
				
			||||||
 * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
 | 
					 | 
				
			||||||
 * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
 | 
					 | 
				
			||||||
 * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
 | 
					 | 
				
			||||||
 * for 2D images created from a buffer.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * When the 2D image from buffer is created, the client must specify the width,
 | 
					 | 
				
			||||||
 * height, image format (i.e. channel order and channel data type) and optionally the row pitch.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
 | 
					 | 
				
			||||||
 * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/**************************************
 * cl_khr_initialize_memory extension *
 **************************************/
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/**************************************
 | 
					 | 
				
			||||||
 * cl_khr_terminate_context extension *
 | 
					 | 
				
			||||||
 **************************************/
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
 | 
					 | 
				
			||||||
#define CL_CONTEXT_TERMINATE_KHR                    0x2032
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define cl_khr_terminate_context 1
 | 
					 | 
				
			||||||
/* cl_khr_terminate_context entry point: takes a cl_context, returns a cl_int status. */
extern CL_API_ENTRY cl_int CL_API_CALL
clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Function-pointer type with the same signature as clTerminateContextKHR. */
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/*
 | 
					 | 
				
			||||||
 * Extension: cl_khr_spir
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * This extension adds support to create an OpenCL program object from a 
 | 
					 | 
				
			||||||
 * Standard Portable Intermediate Representation (SPIR) instance
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
 | 
					 | 
				
			||||||
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*****************************************
 | 
					 | 
				
			||||||
 * cl_khr_create_command_queue extension *
 | 
					 | 
				
			||||||
 *****************************************/
 | 
					 | 
				
			||||||
#define cl_khr_create_command_queue 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_bitfield cl_queue_properties_khr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_command_queue CL_API_CALL
 | 
					 | 
				
			||||||
clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
 | 
					 | 
				
			||||||
                                       cl_device_id /* device */,
 | 
					 | 
				
			||||||
                                       const cl_queue_properties_khr* /* properties */,
 | 
					 | 
				
			||||||
                                       cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_command_queue
 | 
					 | 
				
			||||||
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
 | 
					 | 
				
			||||||
                                                         cl_device_id /* device */,
 | 
					 | 
				
			||||||
                                                         const cl_queue_properties_khr* /* properties */,
 | 
					 | 
				
			||||||
                                                         cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
* cl_nv_device_attribute_query extension *
 | 
					 | 
				
			||||||
******************************************/
 | 
					 | 
				
			||||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
 | 
					 | 
				
			||||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
 | 
					 | 
				
			||||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
 | 
					 | 
				
			||||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
 | 
					 | 
				
			||||||
#define CL_DEVICE_WARP_SIZE_NV                      0x4003
 | 
					 | 
				
			||||||
#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
 | 
					 | 
				
			||||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 | 
					 | 
				
			||||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_amd_device_memory_flags *
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define cl_amd_device_memory_flags 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_MEM_USE_PERSISTENT_MEM_AMD       (1 << 6)        /* Alloc from GPU's CPU visible heap */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_device_info */
 | 
					 | 
				
			||||||
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_amd_device_attribute_query *
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
 | 
					 | 
				
			||||||
#define CL_DEVICE_TOPOLOGY_AMD                      0x4037
 | 
					 | 
				
			||||||
#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
 | 
					 | 
				
			||||||
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD            0x4039
 | 
					 | 
				
			||||||
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD         0x4040
 | 
					 | 
				
			||||||
#define CL_DEVICE_SIMD_WIDTH_AMD                    0x4041
 | 
					 | 
				
			||||||
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
 | 
					 | 
				
			||||||
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD               0x4043
 | 
					 | 
				
			||||||
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD           0x4044
 | 
					 | 
				
			||||||
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD      0x4045
 | 
					 | 
				
			||||||
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
 | 
					 | 
				
			||||||
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD   0x4047
 | 
					 | 
				
			||||||
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD               0x4048
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef union
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    struct { cl_uint type; cl_uint data[5]; } raw;
 | 
					 | 
				
			||||||
    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
 | 
					 | 
				
			||||||
} cl_device_topology_amd;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD            1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/**************************
 | 
					 | 
				
			||||||
* cl_amd_offline_devices *
 | 
					 | 
				
			||||||
**************************/
 | 
					 | 
				
			||||||
#define CL_CONTEXT_OFFLINE_DEVICES_AMD              0x403F
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_arm_printf extension
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define CL_PRINTF_CALLBACK_ARM                      0x40B0
 | 
					 | 
				
			||||||
#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_1_1
 | 
					 | 
				
			||||||
   /***********************************
 | 
					 | 
				
			||||||
    * cl_ext_device_fission extension *
 | 
					 | 
				
			||||||
    ***********************************/
 | 
					 | 
				
			||||||
    #define cl_ext_device_fission   1
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    typedef CL_API_ENTRY cl_int 
 | 
					 | 
				
			||||||
    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    typedef CL_API_ENTRY cl_int 
 | 
					 | 
				
			||||||
    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    typedef cl_ulong  cl_device_partition_property_ext;
 | 
					 | 
				
			||||||
    extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
 | 
					 | 
				
			||||||
                            const cl_device_partition_property_ext * /* properties */,
 | 
					 | 
				
			||||||
                            cl_uint /*num_entries*/,
 | 
					 | 
				
			||||||
                            cl_device_id * /*out_devices*/,
 | 
					 | 
				
			||||||
                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    typedef CL_API_ENTRY cl_int 
 | 
					 | 
				
			||||||
    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
 | 
					 | 
				
			||||||
                                                const cl_device_partition_property_ext * /* properties */,
 | 
					 | 
				
			||||||
                                                cl_uint /*num_entries*/,
 | 
					 | 
				
			||||||
                                                cl_device_id * /*out_devices*/,
 | 
					 | 
				
			||||||
                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    /* cl_device_partition_property_ext */
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    /* clGetDeviceInfo selectors */
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
 | 
					 | 
				
			||||||
    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
 | 
					 | 
				
			||||||
    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    /* error codes */
 | 
					 | 
				
			||||||
    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
 | 
					 | 
				
			||||||
    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
 | 
					 | 
				
			||||||
    #define CL_INVALID_PARTITION_NAME_EXT               -1059
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    /* CL_AFFINITY_DOMAINs */
 | 
					 | 
				
			||||||
    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
 | 
					 | 
				
			||||||
    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
 | 
					 | 
				
			||||||
    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
 | 
					 | 
				
			||||||
    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
 | 
					 | 
				
			||||||
    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
 | 
					 | 
				
			||||||
    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    /* cl_device_partition_property_ext list terminators */
 | 
					 | 
				
			||||||
    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
 | 
					 | 
				
			||||||
    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
 | 
					 | 
				
			||||||
    /* The cast binds to the literal 0 before the subtraction, so the value is
     * (cl_device_partition_property_ext)0 minus 1; assuming cl_ulong is an
     * unsigned type, that wraps to its maximum (all-ones) value. */
    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    /* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
 | 
					 | 
				
			||||||
     * no extension #define since they have no functions
 | 
					 | 
				
			||||||
     */
 | 
					 | 
				
			||||||
    #define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_qcom_ext_host_ptr extension
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
 | 
					 | 
				
			||||||
#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
 | 
					 | 
				
			||||||
#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
 | 
					 | 
				
			||||||
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
 | 
					 | 
				
			||||||
#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
 | 
					 | 
				
			||||||
#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
 | 
					 | 
				
			||||||
#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
 | 
					 | 
				
			||||||
#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint                                   cl_image_pitch_info_qcom;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetDeviceImageInfoQCOM(cl_device_id             device,
 | 
					 | 
				
			||||||
                         size_t                   image_width,
 | 
					 | 
				
			||||||
                         size_t                   image_height,
 | 
					 | 
				
			||||||
                         const cl_image_format   *image_format,
 | 
					 | 
				
			||||||
                         cl_image_pitch_info_qcom param_name,
 | 
					 | 
				
			||||||
                         size_t                   param_value_size,
 | 
					 | 
				
			||||||
                         void                    *param_value,
 | 
					 | 
				
			||||||
                         size_t                  *param_value_size_ret);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef struct _cl_mem_ext_host_ptr
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    /* Type of external memory allocation. */
 | 
					 | 
				
			||||||
    /* Legal values will be defined in layered extensions. */
 | 
					 | 
				
			||||||
    cl_uint  allocation_type;
 | 
					 | 
				
			||||||
            
 | 
					 | 
				
			||||||
    /* Host cache policy for this external memory allocation. */
 | 
					 | 
				
			||||||
    cl_uint  host_cache_policy;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
} cl_mem_ext_host_ptr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_qcom_ion_host_ptr extension
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef struct _cl_mem_ion_host_ptr
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    /* Type of external memory allocation. */
 | 
					 | 
				
			||||||
    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
 | 
					 | 
				
			||||||
    cl_mem_ext_host_ptr  ext_host_ptr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    /* ION file descriptor */
 | 
					 | 
				
			||||||
    int                  ion_filedesc;
 | 
					 | 
				
			||||||
            
 | 
					 | 
				
			||||||
    /* Host pointer to the ION allocated memory */
 | 
					 | 
				
			||||||
    void*                ion_hostptr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
} cl_mem_ion_host_ptr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_1_1 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#if defined(CL_VERSION_1_2)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
 * cl_img_yuv_image extension *
 | 
					 | 
				
			||||||
 ******************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Image formats used in clCreateImage */
 | 
					 | 
				
			||||||
#define CL_NV21_IMG                                 0x40D0
 | 
					 | 
				
			||||||
#define CL_YV12_IMG                                 0x40D1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
 * cl_img_cached_allocations extension *
 | 
					 | 
				
			||||||
 ******************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Flag values used by clCreateBuffer */
 | 
					 | 
				
			||||||
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG         	(1 << 26)
 | 
					 | 
				
			||||||
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG           	(1 << 27)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
 * cl_img_use_gralloc_ptr extension *
 | 
					 | 
				
			||||||
 ******************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Flag values used by clCreateBuffer */
 | 
					 | 
				
			||||||
#define CL_MEM_USE_GRALLOC_PTR_IMG                 	(1 << 28)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* To be used by clGetEventInfo: */
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
 | 
					 | 
				
			||||||
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue      /* command_queue */,
 | 
					 | 
				
			||||||
                                  cl_uint               /* num_objects */,
 | 
					 | 
				
			||||||
                                  const cl_mem *        /* mem_objects */,
 | 
					 | 
				
			||||||
                                  cl_uint               /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                                  const cl_event *      /* event_wait_list */,
 | 
					 | 
				
			||||||
                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue      /* command_queue */,
 | 
					 | 
				
			||||||
                                  cl_uint               /* num_objects */,
 | 
					 | 
				
			||||||
                                  const cl_mem *        /* mem_objects */,
 | 
					 | 
				
			||||||
                                  cl_uint               /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                                  const cl_event *      /* event_wait_list */,
 | 
					 | 
				
			||||||
                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_1_2 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_2_0
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_khr_subgroups extension
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define cl_khr_subgroups 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_kernel_sub_group_info is declared in CL.h. */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_kernel_sub_group_info */
 | 
					 | 
				
			||||||
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR	0x2033
 | 
					 | 
				
			||||||
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR		0x2034
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
 | 
					 | 
				
			||||||
						   cl_device_id /*in_device*/,
 | 
					 | 
				
			||||||
						   cl_kernel_sub_group_info /* param_name */,
 | 
					 | 
				
			||||||
						   size_t /*input_value_size*/,
 | 
					 | 
				
			||||||
						   const void * /*input_value*/,
 | 
					 | 
				
			||||||
						   size_t /*param_value_size*/,
 | 
					 | 
				
			||||||
						   void* /*param_value*/,
 | 
					 | 
				
			||||||
						   size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
 | 
					 | 
				
			||||||
						   
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int
 | 
					 | 
				
			||||||
     ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
 | 
					 | 
				
			||||||
						      cl_device_id /*in_device*/,
 | 
					 | 
				
			||||||
						      cl_kernel_sub_group_info /* param_name */,
 | 
					 | 
				
			||||||
						      size_t /*input_value_size*/,
 | 
					 | 
				
			||||||
						      const void * /*input_value*/,
 | 
					 | 
				
			||||||
						      size_t /*param_value_size*/,
 | 
					 | 
				
			||||||
						      void* /*param_value*/,
 | 
					 | 
				
			||||||
						      size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_2_0 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_2_1
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_khr_priority_hints extension
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define cl_khr_priority_hints 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint  cl_queue_priority_khr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_queue_properties */
 | 
					 | 
				
			||||||
#define CL_QUEUE_PRIORITY_KHR 0x1096
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_queue_priority_khr */
 | 
					 | 
				
			||||||
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
 | 
					 | 
				
			||||||
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
 | 
					 | 
				
			||||||
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_2_1 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_2_1
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_khr_throttle_hints extension
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define cl_khr_throttle_hints 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint  cl_queue_throttle_khr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_queue_properties */
 | 
					 | 
				
			||||||
#define CL_QUEUE_THROTTLE_KHR 0x1097
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_queue_throttle_khr */
 | 
					 | 
				
			||||||
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
 | 
					 | 
				
			||||||
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
 | 
					 | 
				
			||||||
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_2_1 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_2_2
 | 
					 | 
				
			||||||
/*********************************
 | 
					 | 
				
			||||||
* cl_khr_subgroup_named_barrier
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
#define cl_khr_subgroup_named_barrier 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_device_info */
 | 
					 | 
				
			||||||
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_2_2 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/**********************************
 | 
					 | 
				
			||||||
 * cl_arm_import_memory extension *
 | 
					 | 
				
			||||||
 **********************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_1_0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef intptr_t cl_import_properties_arm;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Default and valid property names for cl_arm_import_memory */
 | 
					 | 
				
			||||||
#define CL_IMPORT_TYPE_ARM                        0x40B2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
 | 
					 | 
				
			||||||
#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
 | 
					 | 
				
			||||||
#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
 | 
					 | 
				
			||||||
#define CL_IMPORT_TYPE_SECURE_ARM                 0x40B5
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* This extension adds a new function that allows for direct memory import into
 | 
					 | 
				
			||||||
 * OpenCL via the clImportMemoryARM function.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Memory imported through this interface will be mapped into the device's page
 | 
					 | 
				
			||||||
 * tables directly, providing zero copy access. It will never fall back to copy
 | 
					 | 
				
			||||||
 * operations and aliased buffers.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Types of memory supported for import are specified as additional extension
 | 
					 | 
				
			||||||
 * strings.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * This extension produces cl_mem allocations which are compatible with all other
 | 
					 | 
				
			||||||
 * users of cl_mem in the standard API.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * This extension maps pages with the same properties as the normal buffer creation
 | 
					 | 
				
			||||||
 * function clCreateBuffer.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clImportMemoryARM( cl_context context,
 | 
					 | 
				
			||||||
                   cl_mem_flags flags,
 | 
					 | 
				
			||||||
                   const cl_import_properties_arm *properties,
 | 
					 | 
				
			||||||
                   void *memory,
 | 
					 | 
				
			||||||
                   size_t size,
 | 
					 | 
				
			||||||
                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_1_0 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
 * cl_arm_shared_virtual_memory extension *
 | 
					 | 
				
			||||||
 ******************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CL_VERSION_1_2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*
 * cl_arm_shared_virtual_memory: enumerant tokens and bit flags.
 * Every value below is reproduced unchanged.
 */

/* Token for clGetDeviceInfo. */
#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6

/* Token for clGetMemObjectInfo. */
#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7

/* Tokens for clSetKernelExecInfoARM. */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9

/* Command identifiers, to be used by clGetEventInfo. */
#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE

/*
 * Bit flags returned by clGetDeviceInfo when queried with
 * CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name.
 */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)

/* Bit flags accepted by clSVMAllocARM. */
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_bitfield cl_svm_mem_flags_arm;
 | 
					 | 
				
			||||||
typedef cl_uint     cl_kernel_exec_info_arm;
 | 
					 | 
				
			||||||
typedef cl_bitfield cl_device_svm_capabilities_arm;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY void * CL_API_CALL
 | 
					 | 
				
			||||||
clSVMAllocARM(cl_context       /* context */,
 | 
					 | 
				
			||||||
              cl_svm_mem_flags_arm /* flags */,
 | 
					 | 
				
			||||||
              size_t           /* size */,
 | 
					 | 
				
			||||||
              cl_uint          /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY void CL_API_CALL
 | 
					 | 
				
			||||||
clSVMFreeARM(cl_context        /* context */,
 | 
					 | 
				
			||||||
             void *            /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueSVMFreeARM(cl_command_queue  /* command_queue */,
 | 
					 | 
				
			||||||
                    cl_uint           /* num_svm_pointers */,
 | 
					 | 
				
			||||||
                    void *[]          /* svm_pointers[] */,
 | 
					 | 
				
			||||||
                    void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
 | 
					 | 
				
			||||||
                                                           cl_uint          /* num_svm_pointers */,
 | 
					 | 
				
			||||||
                                                           void *[]         /* svm_pointers[] */,
 | 
					 | 
				
			||||||
                                                           void *           /* user_data */),
 | 
					 | 
				
			||||||
                    void *            /* user_data */,
 | 
					 | 
				
			||||||
                    cl_uint           /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                    const cl_event *  /* event_wait_list */,
 | 
					 | 
				
			||||||
                    cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueSVMMemcpyARM(cl_command_queue  /* command_queue */,
 | 
					 | 
				
			||||||
                      cl_bool           /* blocking_copy */,
 | 
					 | 
				
			||||||
                      void *            /* dst_ptr */,
 | 
					 | 
				
			||||||
                      const void *      /* src_ptr */,
 | 
					 | 
				
			||||||
                      size_t            /* size */,
 | 
					 | 
				
			||||||
                      cl_uint           /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                      const cl_event *  /* event_wait_list */,
 | 
					 | 
				
			||||||
                      cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueSVMMemFillARM(cl_command_queue  /* command_queue */,
 | 
					 | 
				
			||||||
                       void *            /* svm_ptr */,
 | 
					 | 
				
			||||||
                       const void *      /* pattern */,
 | 
					 | 
				
			||||||
                       size_t            /* pattern_size */,
 | 
					 | 
				
			||||||
                       size_t            /* size */,
 | 
					 | 
				
			||||||
                       cl_uint           /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                       const cl_event *  /* event_wait_list */,
 | 
					 | 
				
			||||||
                       cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueSVMMapARM(cl_command_queue  /* command_queue */,
 | 
					 | 
				
			||||||
                   cl_bool           /* blocking_map */,
 | 
					 | 
				
			||||||
                   cl_map_flags      /* flags */,
 | 
					 | 
				
			||||||
                   void *            /* svm_ptr */,
 | 
					 | 
				
			||||||
                   size_t            /* size */,
 | 
					 | 
				
			||||||
                   cl_uint           /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                   const cl_event *  /* event_wait_list */,
 | 
					 | 
				
			||||||
                   cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueSVMUnmapARM(cl_command_queue  /* command_queue */,
 | 
					 | 
				
			||||||
                     void *            /* svm_ptr */,
 | 
					 | 
				
			||||||
                     cl_uint           /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                     const cl_event *  /* event_wait_list */,
 | 
					 | 
				
			||||||
                     cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clSetKernelArgSVMPointerARM(cl_kernel    /* kernel */,
 | 
					 | 
				
			||||||
                            cl_uint      /* arg_index */,
 | 
					 | 
				
			||||||
                            const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clSetKernelExecInfoARM(cl_kernel            /* kernel */,
 | 
					 | 
				
			||||||
                       cl_kernel_exec_info_arm  /* param_name */,
 | 
					 | 
				
			||||||
                       size_t               /* param_value_size */,
 | 
					 | 
				
			||||||
                       const void *         /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* CL_VERSION_1_2 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* __CL_EXT_H */
 | 
					 | 
				
			||||||
							
								
								
									
										429
									
								
								include/triton/external/CL/cl_ext_intel.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										429
									
								
								include/triton/external/CL/cl_ext_intel.h
									
									
									
									
										vendored
									
									
								
							@@ -1,429 +0,0 @@
 | 
				
			|||||||
/*******************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2017 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 ******************************************************************************/
 | 
					 | 
				
			||||||
/*****************************************************************************\
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
					 | 
				
			||||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
					 | 
				
			||||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
					 | 
				
			||||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 | 
					 | 
				
			||||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 | 
					 | 
				
			||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 | 
					 | 
				
			||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 | 
					 | 
				
			||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 | 
					 | 
				
			||||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 | 
					 | 
				
			||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
 | 
					 | 
				
			||||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
File Name: cl_ext_intel.h
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Abstract:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Notes:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
\*****************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __CL_EXT_INTEL_H
 | 
					 | 
				
			||||||
#define __CL_EXT_INTEL_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __APPLE__
 | 
					 | 
				
			||||||
    #include <OpenCL/cl.h>
 | 
					 | 
				
			||||||
    #include <OpenCL/cl_platform.h>
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
    #include "cl.h"
 | 
					 | 
				
			||||||
    #include "cl_platform.h"
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/***************************************
 | 
					 | 
				
			||||||
* cl_intel_thread_local_exec extension *
 | 
					 | 
				
			||||||
****************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macro for the cl_intel_thread_local_exec extension. */
#define cl_intel_thread_local_exec 1

/* Bit 31 of a cl_bitfield; presumably a command-queue property flag -- confirm. */
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/***********************************************
 | 
					 | 
				
			||||||
* cl_intel_device_partition_by_names extension *
 | 
					 | 
				
			||||||
************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macro for the cl_intel_device_partition_by_names extension. */
#define cl_intel_device_partition_by_names 1

/* Partition token and the sentinel terminating a list of names. */
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL          0x4052
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL        -1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/************************************************
 | 
					 | 
				
			||||||
* cl_intel_accelerator extension                *
 | 
					 | 
				
			||||||
* cl_intel_motion_estimation extension          *
 | 
					 | 
				
			||||||
* cl_intel_advanced_motion_estimation extension *
 | 
					 | 
				
			||||||
*************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macros for the accelerator and motion-estimation extensions. */
#define cl_intel_accelerator                1
#define cl_intel_motion_estimation          1
#define cl_intel_advanced_motion_estimation 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
 | 
					 | 
				
			||||||
typedef cl_uint cl_accelerator_type_intel;
 | 
					 | 
				
			||||||
typedef cl_uint cl_accelerator_info_intel;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef struct _cl_motion_estimation_desc_intel {
 | 
					 | 
				
			||||||
    cl_uint mb_block_type;
 | 
					 | 
				
			||||||
    cl_uint subpixel_mode;
 | 
					 | 
				
			||||||
    cl_uint sad_adjust_mode;
 | 
					 | 
				
			||||||
    cl_uint search_path_type;
 | 
					 | 
				
			||||||
} cl_motion_estimation_desc_intel;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Accelerator error codes. */
#define CL_INVALID_ACCELERATOR_INTEL                              -1094
#define CL_INVALID_ACCELERATOR_TYPE_INTEL                         -1095
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL                   -1096
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL                   -1097

/* Values of cl_accelerator_type_intel. */
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL               0x0

/* Values of cl_accelerator_info_intel. */
#define CL_ACCELERATOR_DESCRIPTOR_INTEL                           0x4090
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL                      0x4091
#define CL_ACCELERATOR_CONTEXT_INTEL                              0x4092
#define CL_ACCELERATOR_TYPE_INTEL                                 0x4093
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*
 * cl_motion_detect_desc_intel flags.
 * Grouped by name prefix; every value is reproduced unchanged.
 */

/* Macro-block type. */
#define CL_ME_MB_TYPE_16x16_INTEL                                 0x0
#define CL_ME_MB_TYPE_8x8_INTEL                                   0x1
#define CL_ME_MB_TYPE_4x4_INTEL                                   0x2

/* Sub-pixel mode. */
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL                         0x0
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                            0x1
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                            0x2

/* SAD adjust mode. */
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                          0x0
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                          0x1

/* Search path radius. */
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL                        0x0
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL                        0x1
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL                      0x5

/* Skip block type and intra-predict enables. */
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL                         0x0
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL                  0x1
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL                    0x2
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL                           0x4

/* Input mode. */
#define CL_ME_FORWARD_INPUT_MODE_INTEL                            0x1
#define CL_ME_BACKWARD_INPUT_MODE_INTEL                           0x2
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL                        0x3

/* Bidirectional weights (decimal in the original). */
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL                          16
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL                            21
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL                             32
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL                        43
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL                    48

/* Cost penalty. */
#define CL_ME_COST_PENALTY_NONE_INTEL                             0x0
#define CL_ME_COST_PENALTY_LOW_INTEL                              0x1
#define CL_ME_COST_PENALTY_NORMAL_INTEL                           0x2
#define CL_ME_COST_PENALTY_HIGH_INTEL                             0x3

/* Cost precision. */
#define CL_ME_COST_PRECISION_QPEL_INTEL                           0x0
#define CL_ME_COST_PRECISION_HPEL_INTEL                           0x1
#define CL_ME_COST_PRECISION_PEL_INTEL                            0x2
#define CL_ME_COST_PRECISION_DPEL_INTEL                           0x3

/*
 * Luma predictor mode.
 * NOTE(review): DIAGONAL_DOWN_RIGHT and PLANE both expand to 0x4; this is
 * preserved exactly as found -- confirm against the extension spec.
 */
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL                  0x0
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL                0x1
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                        0x2
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL        0x3
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL       0x4
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL                     0x4
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL            0x5
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL           0x6
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL             0x7
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL             0x8

/* Chroma predictor mode. */
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                      0x0
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL              0x1
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL                0x2
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL                   0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_device_info token for the motion-estimation version. */
#define CL_DEVICE_ME_VERSION_INTEL                                0x407E

/* Motion-estimation version values. */
#define CL_ME_VERSION_LEGACY_INTEL                                0x0
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL                        0x1
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL                        0x2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
 | 
					 | 
				
			||||||
clCreateAcceleratorINTEL(
 | 
					 | 
				
			||||||
    cl_context                  /* context */,
 | 
					 | 
				
			||||||
    cl_accelerator_type_intel   /* accelerator_type */,
 | 
					 | 
				
			||||||
    size_t                      /* descriptor_size */,
 | 
					 | 
				
			||||||
    const void*                 /* descriptor */,
 | 
					 | 
				
			||||||
    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_context                  /* context */,
 | 
					 | 
				
			||||||
    cl_accelerator_type_intel   /* accelerator_type */,
 | 
					 | 
				
			||||||
    size_t                      /* descriptor_size */,
 | 
					 | 
				
			||||||
    const void*                 /* descriptor */,
 | 
					 | 
				
			||||||
    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetAcceleratorInfoINTEL(
 | 
					 | 
				
			||||||
    cl_accelerator_intel        /* accelerator */,
 | 
					 | 
				
			||||||
    cl_accelerator_info_intel   /* param_name */,
 | 
					 | 
				
			||||||
    size_t                      /* param_value_size */,
 | 
					 | 
				
			||||||
    void*                       /* param_value */,
 | 
					 | 
				
			||||||
    size_t*                     /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_accelerator_intel        /* accelerator */,
 | 
					 | 
				
			||||||
    cl_accelerator_info_intel   /* param_name */,
 | 
					 | 
				
			||||||
    size_t                      /* param_value_size */,
 | 
					 | 
				
			||||||
    void*                       /* param_value */,
 | 
					 | 
				
			||||||
    size_t*                     /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clRetainAcceleratorINTEL(
 | 
					 | 
				
			||||||
    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clReleaseAcceleratorINTEL(
 | 
					 | 
				
			||||||
    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
* cl_intel_simultaneous_sharing extension *
 | 
					 | 
				
			||||||
*******************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macro for the cl_intel_simultaneous_sharing extension. */
#define cl_intel_simultaneous_sharing 1

/* Device tokens defined by this extension. */
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL            0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL        0x4105
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/***********************************
 | 
					 | 
				
			||||||
* cl_intel_egl_image_yuv extension *
 | 
					 | 
				
			||||||
************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macro for the cl_intel_egl_image_yuv extension. */
#define cl_intel_egl_image_yuv 1

/* Token defined by this extension. */
#define CL_EGL_YUV_PLANE_INTEL                           0x4107
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/********************************
 | 
					 | 
				
			||||||
* cl_intel_packed_yuv extension *
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macro for the cl_intel_packed_yuv extension. */
#define cl_intel_packed_yuv 1

/* Packed YUV tokens, one per component ordering. */
#define CL_YUYV_INTEL                                    0x4076
#define CL_UYVY_INTEL                                    0x4077
#define CL_YVYU_INTEL                                    0x4078
#define CL_VYUY_INTEL                                    0x4079
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/********************************************
 | 
					 | 
				
			||||||
* cl_intel_required_subgroup_size extension *
 | 
					 | 
				
			||||||
*********************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Feature-test macro for the cl_intel_required_subgroup_size extension. */
#define cl_intel_required_subgroup_size 1

/* Device and kernel tokens defined by this extension. */
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL                  0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL                   0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL           0x410A
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/****************************************
 | 
					 | 
				
			||||||
* cl_intel_driver_diagnostics extension *
 | 
					 | 
				
			||||||
*****************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define cl_intel_driver_diagnostics 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint cl_diagnostics_verbose_level;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Context token for driver diagnostics. */
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL                0x4106

/* Diagnostics level values: ALL is 0xff; the others are single bits. */
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL           ( 0xff )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL          ( 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL           ( 1 << 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL       ( 1 << 2 )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/********************************
 | 
					 | 
				
			||||||
* cl_intel_planar_yuv extension *
 | 
					 | 
				
			||||||
*********************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Planar YUV token. */
#define CL_NV12_INTEL                                       0x410E

/* Memory flags: bits 24 and 25. */
#define CL_MEM_NO_ACCESS_INTEL                              ( 1 << 24 )
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL              ( 1 << 25 )

/* Device tokens for the maximum planar YUV width and height. */
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL                0x417E
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL               0x417F
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*******************************************************
 | 
					 | 
				
			||||||
* cl_intel_device_side_avc_motion_estimation extension *
 | 
					 | 
				
			||||||
********************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Device tokens for device-side AVC motion estimation. */
#define CL_DEVICE_AVC_ME_VERSION_INTEL                      0x410B
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL          0x410D

/*
 * AVC motion-estimation version values.
 * These must expand to a bare constant with no trailing ';', otherwise any
 * use inside an expression (comparison, argument, initializer list) fails
 * to compile. Block comments keep the header valid as C89.
 */
#define CL_AVC_ME_VERSION_0_INTEL                           0x0   /* No support. */
#define CL_AVC_ME_VERSION_1_INTEL                           0x1   /* First supported version. */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_16x16_INTEL                         0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_16x8_INTEL                          0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_8x16_INTEL                          0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_8x8_INTEL                           0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MINOR_8x8_INTEL                           0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MINOR_8x4_INTEL                           0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MINOR_4x8_INTEL                           0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MINOR_4x4_INTEL                           0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_FORWARD_INTEL                       0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL                      0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL                 0x2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL                  0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL                0x7E
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL                 0x7D
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL                 0x7B
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL                  0x77
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL                  0x6F
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL                  0x5F
 | 
					 | 
				
			||||||
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL                  0x3F
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL            0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL                 0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL                  0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL            0x3
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL               0x4
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL         0x5
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL             0x6
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL             0x7
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL                0x8
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL          0x9
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL            0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL            0xa
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL                 0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL                 0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL                  0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL                 0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL                0x10
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL                  0x15
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL                   0x20
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL              0x2B
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL          0x30
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL                 0x0 /* NOTE(review): RIGHT/TOP/BOTTOM are 0x2/0x4/0x8; if these are combined as bit flags a zero value cannot be detected with '&' -- confirm against the extension spec */
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL                0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL                  0x4
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL               0x8
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL          0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL            0x4000
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*
 * Skip-block enable masks.  Each mask is a small constant shifted left by
 * 24, 26, 28 or 30 bits.
 *
 * The shifted operands carry a 'u' suffix so the shift is performed on an
 * unsigned int.  Three of the masks (0xAA << 24, 0xFF << 24 and 0x2 << 30)
 * set bit 31; on a 32-bit signed int that shifts a 1 into the sign bit,
 * which is undefined behaviour in C.  The resulting bit patterns are the
 * same as before; only the type of the expression changes to unsigned int.
 */

/* 16x16 partition: forward / backward / both. */
#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL     ( 0x1u << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL    ( 0x2u << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL        ( 0x3u << 24 )

/* 8x8 partitions, all four blocks at once: forward / backward / both. */
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL       ( 0x55u << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL      ( 0xAAu << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL          ( 0xFFu << 24 )

/* 8x8 partitions, one block at a time (block index 0..3, two bits each). */
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL     ( 0x1u << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL    ( 0x2u << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL     ( 0x1u << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL    ( 0x2u << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL     ( 0x1u << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL    ( 0x2u << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL     ( 0x1u << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL    ( 0x2u << 30 )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL                0x00
 | 
					 | 
				
			||||||
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL                0x80
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_16x16_INTEL                         0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_8x8_INTEL                           0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_4x4_INTEL                           0x2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL     0x6
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL       0x5
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL       0x3 
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL         0x60
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL        0x10
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL  0x8
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL   0x4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4 /* NOTE(review): same value as CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL (0x4) -- confirm the two modes are meant to share a code */
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7
 | 
					 | 
				
			||||||
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8
 | 
					 | 
				
			||||||
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_FRAME_FORWARD_INTEL                       0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_FRAME_BACKWARD_INTEL                      0x2
 | 
					 | 
				
			||||||
#define CL_AVC_ME_FRAME_DUAL_INTEL                          0x3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL                     0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL                    0x1
 | 
					 | 
				
			||||||
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL                    0x2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL           0x0
 | 
					 | 
				
			||||||
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL        0x1  
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif /* __CL_EXT_INTEL_H */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										167
									
								
								include/triton/external/CL/cl_gl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										167
									
								
								include/triton/external/CL/cl_gl.h
									
									
									
									
										vendored
									
									
								
							@@ -1,167 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_GL_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_GL_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __APPLE__
 | 
					 | 
				
			||||||
#include <OpenCL/cl.h>
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#endif	
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint     cl_gl_object_type;
 | 
					 | 
				
			||||||
typedef cl_uint     cl_gl_texture_info;
 | 
					 | 
				
			||||||
typedef cl_uint     cl_gl_platform_info;
 | 
					 | 
				
			||||||
typedef struct __GLsync *cl_GLsync;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_gl_object_type: enum values 0x2000 - 0x2012 are currently taken by CL/GL tokens */
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_BUFFER                     0x2000
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_TEXTURE2D                  0x2001
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_TEXTURE3D                  0x2002
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_RENDERBUFFER               0x2003
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_TEXTURE1D                  0x200F
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
 | 
					 | 
				
			||||||
#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_gl_texture_info           */
 | 
					 | 
				
			||||||
#define CL_GL_TEXTURE_TARGET                    0x2004
 | 
					 | 
				
			||||||
#define CL_GL_MIPMAP_LEVEL                      0x2005
 | 
					 | 
				
			||||||
#define CL_GL_NUM_SAMPLES                       0x2012
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clCreateFromGLBuffer(cl_context     /* context */,
 | 
					 | 
				
			||||||
                     cl_mem_flags   /* flags */,
 | 
					 | 
				
			||||||
                     cl_GLuint      /* bufobj */,
 | 
					 | 
				
			||||||
                     int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clCreateFromGLTexture(cl_context      /* context */,
 | 
					 | 
				
			||||||
                      cl_mem_flags    /* flags */,
 | 
					 | 
				
			||||||
                      cl_GLenum       /* target */,
 | 
					 | 
				
			||||||
                      cl_GLint        /* miplevel */,
 | 
					 | 
				
			||||||
                      cl_GLuint       /* texture */,
 | 
					 | 
				
			||||||
                      cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clCreateFromGLRenderbuffer(cl_context   /* context */,
 | 
					 | 
				
			||||||
                           cl_mem_flags /* flags */,
 | 
					 | 
				
			||||||
                           cl_GLuint    /* renderbuffer */,
 | 
					 | 
				
			||||||
                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetGLObjectInfo(cl_mem                /* memobj */,
 | 
					 | 
				
			||||||
                  cl_gl_object_type *   /* gl_object_type */,
 | 
					 | 
				
			||||||
                  cl_GLuint *           /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
                  
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetGLTextureInfo(cl_mem               /* memobj */,
 | 
					 | 
				
			||||||
                   cl_gl_texture_info   /* param_name */,
 | 
					 | 
				
			||||||
                   size_t               /* param_value_size */,
 | 
					 | 
				
			||||||
                   void *               /* param_value */,
 | 
					 | 
				
			||||||
                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
 | 
					 | 
				
			||||||
                          cl_uint               /* num_objects */,
 | 
					 | 
				
			||||||
                          const cl_mem *        /* mem_objects */,
 | 
					 | 
				
			||||||
                          cl_uint               /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                          const cl_event *      /* event_wait_list */,
 | 
					 | 
				
			||||||
                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
 | 
					 | 
				
			||||||
                          cl_uint               /* num_objects */,
 | 
					 | 
				
			||||||
                          const cl_mem *        /* mem_objects */,
 | 
					 | 
				
			||||||
                          cl_uint               /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
                          const cl_event *      /* event_wait_list */,
 | 
					 | 
				
			||||||
                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* Deprecated OpenCL 1.1 APIs */
 | 
					 | 
				
			||||||
/*
 * Prototype for clCreateFromGLTexture2D, wrapped in the
 * CL_EXT_PREFIX__VERSION_1_1_DEPRECATED / CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
 * markers.  Returns a cl_mem.
 */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(
    cl_context   context,
    cl_mem_flags flags,
    cl_GLenum    target,
    cl_GLint     miplevel,
    cl_GLuint    texture,
    cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/*
 * Prototype for clCreateFromGLTexture3D, wrapped in the
 * CL_EXT_PREFIX__VERSION_1_1_DEPRECATED / CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
 * markers.  Returns a cl_mem.
 */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(
    cl_context   context,
    cl_mem_flags flags,
    cl_GLenum    target,
    cl_GLint     miplevel,
    cl_GLuint    texture,
    cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/* cl_khr_gl_sharing extension  */
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
#define cl_khr_gl_sharing 1
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
typedef cl_uint     cl_gl_context_info;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/* Additional Error Codes  */
 | 
					 | 
				
			||||||
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/* cl_gl_context_info  */
 | 
					 | 
				
			||||||
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
 | 
					 | 
				
			||||||
#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/* Additional cl_context_properties  */
 | 
					 | 
				
			||||||
#define CL_GL_CONTEXT_KHR                       0x2008
 | 
					 | 
				
			||||||
#define CL_EGL_DISPLAY_KHR                      0x2009
 | 
					 | 
				
			||||||
#define CL_GLX_DISPLAY_KHR                      0x200A
 | 
					 | 
				
			||||||
#define CL_WGL_HDC_KHR                          0x200B
 | 
					 | 
				
			||||||
#define CL_CGL_SHAREGROUP_KHR                   0x200C
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
 | 
					 | 
				
			||||||
                      cl_gl_context_info            /* param_name */,
 | 
					 | 
				
			||||||
                      size_t                        /* param_value_size */,
 | 
					 | 
				
			||||||
                      void *                        /* param_value */,
 | 
					 | 
				
			||||||
                      size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
/*
 * Function-pointer type with the same parameter list and return type as
 * clGetGLContextInfoKHR.
 */
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clGetGLContextInfoKHR_fn)(const cl_context_properties *properties,
                                        cl_gl_context_info param_name,
                                        size_t param_value_size,
                                        void *param_value,
                                        size_t *param_value_size_ret);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_CL_GL_H */
 | 
					 | 
				
			||||||
							
								
								
									
										74
									
								
								include/triton/external/CL/cl_gl_ext.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										74
									
								
								include/triton/external/CL/cl_gl_ext.h
									
									
									
									
										vendored
									
									
								
							@@ -1,74 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
 | 
					 | 
				
			||||||
/* OpenGL dependencies.                                                         */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_GL_EXT_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_GL_EXT_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __APPLE__
 | 
					 | 
				
			||||||
    #include <OpenCL/cl_gl.h>
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
    #include "cl_gl.h"
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*
 | 
					 | 
				
			||||||
 * For each extension, follow this template
 | 
					 | 
				
			||||||
 *  cl_VEN_extname extension  */
 | 
					 | 
				
			||||||
/* #define cl_VEN_extname 1
 | 
					 | 
				
			||||||
 * ... define new types, if any
 | 
					 | 
				
			||||||
 * ... define new tokens, if any
 | 
					 | 
				
			||||||
 * ... define new APIs, if any
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
 | 
					 | 
				
			||||||
 *  This allows us to avoid having to decide whether to include GL headers or GLES here.
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* 
 | 
					 | 
				
			||||||
 *  cl_khr_gl_event  extension
 | 
					 | 
				
			||||||
 *  See section 9.9 in the OpenCL 1.1 spec for more information
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_event CL_API_CALL
 | 
					 | 
				
			||||||
clCreateEventFromGLsyncKHR(cl_context           /* context */,
 | 
					 | 
				
			||||||
                           cl_GLsync            /* cl_GLsync */,
 | 
					 | 
				
			||||||
                           cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif	/* __OPENCL_CL_GL_EXT_H  */
 | 
					 | 
				
			||||||
							
								
								
									
										1458
									
								
								include/triton/external/CL/cl_platform.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1458
									
								
								include/triton/external/CL/cl_platform.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,172 +0,0 @@
 | 
				
			|||||||
/**********************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2016 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 **********************************************************************************/
 | 
					 | 
				
			||||||
/*****************************************************************************\
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
					 | 
				
			||||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
					 | 
				
			||||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
					 | 
				
			||||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
 | 
					 | 
				
			||||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 | 
					 | 
				
			||||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 | 
					 | 
				
			||||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 | 
					 | 
				
			||||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 | 
					 | 
				
			||||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
 | 
					 | 
				
			||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
 | 
					 | 
				
			||||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
File Name: cl_va_api_media_sharing_intel.h
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Abstract:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Notes:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
\*****************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
 | 
					 | 
				
			||||||
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#include "cl_platform.h"
 | 
					 | 
				
			||||||
#include <va/va.h>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/******************************************
 | 
					 | 
				
			||||||
* cl_intel_va_api_media_sharing extension *
 | 
					 | 
				
			||||||
*******************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define cl_intel_va_api_media_sharing 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* error codes */
 | 
					 | 
				
			||||||
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL               -1098
 | 
					 | 
				
			||||||
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL               -1099
 | 
					 | 
				
			||||||
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL      -1100
 | 
					 | 
				
			||||||
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL          -1101
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_va_api_device_source_intel */
 | 
					 | 
				
			||||||
#define CL_VA_API_DISPLAY_INTEL                             0x4094
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_va_api_device_set_intel */
 | 
					 | 
				
			||||||
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL               0x4095
 | 
					 | 
				
			||||||
#define CL_ALL_DEVICES_FOR_VA_API_INTEL                     0x4096
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_context_info */
 | 
					 | 
				
			||||||
#define CL_CONTEXT_VA_API_DISPLAY_INTEL                     0x4097
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_mem_info */
 | 
					 | 
				
			||||||
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL                   0x4098
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_image_info */
 | 
					 | 
				
			||||||
#define CL_IMAGE_VA_API_PLANE_INTEL                         0x4099
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* cl_command_type */
 | 
					 | 
				
			||||||
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL      0x409A
 | 
					 | 
				
			||||||
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL      0x409B
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef cl_uint cl_va_api_device_source_intel;
 | 
					 | 
				
			||||||
typedef cl_uint cl_va_api_device_set_intel;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
 | 
					 | 
				
			||||||
    cl_platform_id                /* platform */,
 | 
					 | 
				
			||||||
    cl_va_api_device_source_intel /* media_adapter_type */,
 | 
					 | 
				
			||||||
    void*                         /* media_adapter */,
 | 
					 | 
				
			||||||
    cl_va_api_device_set_intel    /* media_adapter_set */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_entries */,
 | 
					 | 
				
			||||||
    cl_device_id*                 /* devices */,
 | 
					 | 
				
			||||||
    cl_uint*                      /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_platform_id                /* platform */,
 | 
					 | 
				
			||||||
    cl_va_api_device_source_intel /* media_adapter_type */,
 | 
					 | 
				
			||||||
    void*                         /* media_adapter */,
 | 
					 | 
				
			||||||
    cl_va_api_device_set_intel    /* media_adapter_set */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_entries */,
 | 
					 | 
				
			||||||
    cl_device_id*                 /* devices */,
 | 
					 | 
				
			||||||
    cl_uint*                      /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_mem CL_API_CALL
 | 
					 | 
				
			||||||
clCreateFromVA_APIMediaSurfaceINTEL(
 | 
					 | 
				
			||||||
    cl_context                    /* context */,
 | 
					 | 
				
			||||||
    cl_mem_flags                  /* flags */,
 | 
					 | 
				
			||||||
    VASurfaceID*                  /* surface */,
 | 
					 | 
				
			||||||
    cl_uint                       /* plane */,
 | 
					 | 
				
			||||||
    cl_int*                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_context                    /* context */,
 | 
					 | 
				
			||||||
    cl_mem_flags                  /* flags */,
 | 
					 | 
				
			||||||
    VASurfaceID*                  /* surface */,
 | 
					 | 
				
			||||||
    cl_uint                       /* plane */,
 | 
					 | 
				
			||||||
    cl_int*                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
 | 
					 | 
				
			||||||
    cl_command_queue              /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_objects */,
 | 
					 | 
				
			||||||
    const cl_mem*                 /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*               /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue              /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_objects */,
 | 
					 | 
				
			||||||
    const cl_mem*                 /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*               /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
extern CL_API_ENTRY cl_int CL_API_CALL
 | 
					 | 
				
			||||||
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
 | 
					 | 
				
			||||||
    cl_command_queue              /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_objects */,
 | 
					 | 
				
			||||||
    const cl_mem*                 /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*               /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
 | 
					 | 
				
			||||||
    cl_command_queue              /* command_queue */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_objects */,
 | 
					 | 
				
			||||||
    const cl_mem*                 /* mem_objects */,
 | 
					 | 
				
			||||||
    cl_uint                       /* num_events_in_wait_list */,
 | 
					 | 
				
			||||||
    const cl_event*               /* event_wait_list */,
 | 
					 | 
				
			||||||
    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										59
									
								
								include/triton/external/CL/opencl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										59
									
								
								include/triton/external/CL/opencl.h
									
									
									
									
										vendored
									
									
								
							@@ -1,59 +0,0 @@
 | 
				
			|||||||
/*******************************************************************************
 | 
					 | 
				
			||||||
 * Copyright (c) 2008-2015 The Khronos Group Inc.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
					 | 
				
			||||||
 * copy of this software and/or associated documentation files (the
 | 
					 | 
				
			||||||
 * "Materials"), to deal in the Materials without restriction, including
 | 
					 | 
				
			||||||
 * without limitation the rights to use, copy, modify, merge, publish,
 | 
					 | 
				
			||||||
 * distribute, sublicense, and/or sell copies of the Materials, and to
 | 
					 | 
				
			||||||
 * permit persons to whom the Materials are furnished to do so, subject to
 | 
					 | 
				
			||||||
 * the following conditions:
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * The above copyright notice and this permission notice shall be included
 | 
					 | 
				
			||||||
 * in all copies or substantial portions of the Materials.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
 | 
					 | 
				
			||||||
 * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
 | 
					 | 
				
			||||||
 * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
 | 
					 | 
				
			||||||
 *    https://www.khronos.org/registry/
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 | 
					 | 
				
			||||||
 ******************************************************************************/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef __OPENCL_H
 | 
					 | 
				
			||||||
#define __OPENCL_H
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
extern "C" {
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __APPLE__
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <OpenCL/cl.h>
 | 
					 | 
				
			||||||
#include <OpenCL/cl_gl.h>
 | 
					 | 
				
			||||||
#include <OpenCL/cl_gl_ext.h>
 | 
					 | 
				
			||||||
#include <OpenCL/cl_ext.h>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "cl.h"
 | 
					 | 
				
			||||||
#include "cl_gl.h"
 | 
					 | 
				
			||||||
#include "cl_gl_ext.h"
 | 
					 | 
				
			||||||
#include "cl_ext.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef __cplusplus
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#endif  /* __OPENCL_H   */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										288
									
								
								include/triton/external/hip.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										288
									
								
								include/triton/external/hip.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,288 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * @brief hipError_t
 | 
				
			||||||
 | 
					 * @enum
 | 
				
			||||||
 | 
					 * @ingroup Enumerations
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
 | 
				
			||||||
 | 
					// NVCC and HCC paths. Also update the hipCUDAErrorTohipError function in NVCC path.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Ignoring error-code return values from hip APIs is discouraged. On C++17,
 | 
				
			||||||
 | 
					// we can make that yield a warning
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * @brief hipError_t
 | 
				
			||||||
 | 
					 * @enum
 | 
				
			||||||
 | 
					 * @ingroup Enumerations
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
 | 
				
			||||||
 | 
					// NVCC and HCC paths. Also update the hipCUDAErrorTohipError function in NVCC path.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cstddef>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef enum hipError_t {
 | 
				
			||||||
 | 
					    hipSuccess = 0,  ///< Successful completion.
 | 
				
			||||||
 | 
					    hipErrorInvalidValue = 1,  ///< One or more of the parameters passed to the API call is NULL
 | 
				
			||||||
 | 
					                               ///< or not in an acceptable range.
 | 
				
			||||||
 | 
					    hipErrorOutOfMemory = 2,
 | 
				
			||||||
 | 
					    // Deprecated
 | 
				
			||||||
 | 
					    hipErrorMemoryAllocation = 2,  ///< Memory allocation error.
 | 
				
			||||||
 | 
					    hipErrorNotInitialized = 3,
 | 
				
			||||||
 | 
					    // Deprecated
 | 
				
			||||||
 | 
					    hipErrorInitializationError = 3,
 | 
				
			||||||
 | 
					    hipErrorDeinitialized = 4,
 | 
				
			||||||
 | 
					    hipErrorProfilerDisabled = 5,
 | 
				
			||||||
 | 
					    hipErrorProfilerNotInitialized = 6,
 | 
				
			||||||
 | 
					    hipErrorProfilerAlreadyStarted = 7,
 | 
				
			||||||
 | 
					    hipErrorProfilerAlreadyStopped = 8,
 | 
				
			||||||
 | 
					    hipErrorInvalidConfiguration = 9,
 | 
				
			||||||
 | 
					    hipErrorInvalidPitchValue = 12,
 | 
				
			||||||
 | 
					    hipErrorInvalidSymbol = 13,
 | 
				
			||||||
 | 
					    hipErrorInvalidDevicePointer = 17,  ///< Invalid Device Pointer
 | 
				
			||||||
 | 
					    hipErrorInvalidMemcpyDirection = 21,  ///< Invalid memory copy direction
 | 
				
			||||||
 | 
					    hipErrorInsufficientDriver = 35,
 | 
				
			||||||
 | 
					    hipErrorMissingConfiguration = 52,
 | 
				
			||||||
 | 
					    hipErrorPriorLaunchFailure = 53,
 | 
				
			||||||
 | 
					    hipErrorInvalidDeviceFunction = 98,
 | 
				
			||||||
 | 
					    hipErrorNoDevice = 100,  ///< Call to hipGetDeviceCount returned 0 devices
 | 
				
			||||||
 | 
					    hipErrorInvalidDevice = 101,  ///< DeviceID must be in range 0...#compute-devices.
 | 
				
			||||||
 | 
					    hipErrorInvalidImage = 200,
 | 
				
			||||||
 | 
					    hipErrorInvalidContext = 201,  ///< Produced when input context is invalid.
 | 
				
			||||||
 | 
					    hipErrorContextAlreadyCurrent = 202,
 | 
				
			||||||
 | 
					    hipErrorMapFailed = 205,
 | 
				
			||||||
 | 
					    // Deprecated
 | 
				
			||||||
 | 
					    hipErrorMapBufferObjectFailed = 205,  ///< Produced when the IPC memory attach failed from ROCr.
 | 
				
			||||||
 | 
					    hipErrorUnmapFailed = 206,
 | 
				
			||||||
 | 
					    hipErrorArrayIsMapped = 207,
 | 
				
			||||||
 | 
					    hipErrorAlreadyMapped = 208,
 | 
				
			||||||
 | 
					    hipErrorNoBinaryForGpu = 209,
 | 
				
			||||||
 | 
					    hipErrorAlreadyAcquired = 210,
 | 
				
			||||||
 | 
					    hipErrorNotMapped = 211,
 | 
				
			||||||
 | 
					    hipErrorNotMappedAsArray = 212,
 | 
				
			||||||
 | 
					    hipErrorNotMappedAsPointer = 213,
 | 
				
			||||||
 | 
					    hipErrorECCNotCorrectable = 214,
 | 
				
			||||||
 | 
					    hipErrorUnsupportedLimit = 215,
 | 
				
			||||||
 | 
					    hipErrorContextAlreadyInUse = 216,
 | 
				
			||||||
 | 
					    hipErrorPeerAccessUnsupported = 217,
 | 
				
			||||||
 | 
					    hipErrorInvalidKernelFile = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
 | 
				
			||||||
 | 
					    hipErrorInvalidGraphicsContext = 219,
 | 
				
			||||||
 | 
					    hipErrorInvalidSource = 300,
 | 
				
			||||||
 | 
					    hipErrorFileNotFound = 301,
 | 
				
			||||||
 | 
					    hipErrorSharedObjectSymbolNotFound = 302,
 | 
				
			||||||
 | 
					    hipErrorSharedObjectInitFailed = 303,
 | 
				
			||||||
 | 
					    hipErrorOperatingSystem = 304,
 | 
				
			||||||
 | 
					    hipErrorInvalidHandle = 400,
 | 
				
			||||||
 | 
					    // Deprecated
 | 
				
			||||||
 | 
					    hipErrorInvalidResourceHandle = 400,  ///< Resource handle (hipEvent_t or hipStream_t) invalid.
 | 
				
			||||||
 | 
					    hipErrorNotFound = 500,
 | 
				
			||||||
 | 
					    hipErrorNotReady = 600,  ///< Indicates that asynchronous operations enqueued earlier are not
 | 
				
			||||||
 | 
					                             ///< ready.  This is not actually an error, but is used to distinguish
 | 
				
			||||||
 | 
					                             ///< from hipSuccess (which indicates completion).  APIs that return
 | 
				
			||||||
 | 
					                             ///< this error include hipEventQuery and hipStreamQuery.
 | 
				
			||||||
 | 
					    hipErrorIllegalAddress = 700,
 | 
				
			||||||
 | 
					    hipErrorLaunchOutOfResources = 701,  ///< Out of resources error.
 | 
				
			||||||
 | 
					    hipErrorLaunchTimeOut = 702,
 | 
				
			||||||
 | 
					    hipErrorPeerAccessAlreadyEnabled =
 | 
				
			||||||
 | 
					        704,  ///< Peer access was already enabled from the current device.
 | 
				
			||||||
 | 
					    hipErrorPeerAccessNotEnabled =
 | 
				
			||||||
 | 
					        705,  ///< Peer access was never enabled from the current device.
 | 
				
			||||||
 | 
					    hipErrorSetOnActiveProcess = 708,
 | 
				
			||||||
 | 
					    hipErrorAssert = 710,  ///< Produced when the kernel calls assert.
 | 
				
			||||||
 | 
					    hipErrorHostMemoryAlreadyRegistered =
 | 
				
			||||||
 | 
					        712,  ///< Produced when trying to lock a page-locked memory.
 | 
				
			||||||
 | 
					    hipErrorHostMemoryNotRegistered =
 | 
				
			||||||
 | 
					        713,  ///< Produced when trying to unlock a non-page-locked memory.
 | 
				
			||||||
 | 
					    hipErrorLaunchFailure =
 | 
				
			||||||
 | 
					        719,  ///< An exception occurred on the device while executing a kernel.
 | 
				
			||||||
 | 
					    hipErrorCooperativeLaunchTooLarge =
 | 
				
			||||||
 | 
					        720,  ///< This error indicates that the number of blocks launched per grid for a kernel
 | 
				
			||||||
 | 
					              ///< that was launched via cooperative launch APIs exceeds the maximum number of
 | 
				
			||||||
 | 
					              ///< allowed blocks for the current device
 | 
				
			||||||
 | 
					    hipErrorNotSupported = 801,  ///< Produced when the hip API is not supported/implemented
 | 
				
			||||||
 | 
					    hipErrorUnknown = 999,  ///< Unknown error.
 | 
				
			||||||
 | 
					    // HSA Runtime Error Codes start here.
 | 
				
			||||||
 | 
					    hipErrorRuntimeMemory = 1052,  ///< HSA runtime memory call returned error.  Typically not seen
 | 
				
			||||||
 | 
					                                   ///< in production systems.
 | 
				
			||||||
 | 
					    hipErrorRuntimeOther = 1053,  ///< HSA runtime call other than memory returned error.  Typically
 | 
				
			||||||
 | 
					                                  ///< not seen in production systems.
 | 
				
			||||||
 | 
					    hipErrorTbd  ///< Marker that more error codes are needed.
 | 
				
			||||||
 | 
					} hipError_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct ihipCtx_t* hipCtx_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Note many APIs also use integer deviceIds as an alternative to the device pointer:
 | 
				
			||||||
 | 
					typedef int hipDevice_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef enum hipDeviceP2PAttr {
 | 
				
			||||||
 | 
					  hipDevP2PAttrPerformanceRank = 0,
 | 
				
			||||||
 | 
					  hipDevP2PAttrAccessSupported,
 | 
				
			||||||
 | 
					  hipDevP2PAttrNativeAtomicSupported,
 | 
				
			||||||
 | 
					  hipDevP2PAttrHipArrayAccessSupported
 | 
				
			||||||
 | 
					} hipDeviceP2PAttr;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct ihipStream_t* hipStream_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define hipIpcMemLazyEnablePeerAccess 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define HIP_IPC_HANDLE_SIZE 64
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct hipIpcMemHandle_st {
 | 
				
			||||||
 | 
					    char reserved[HIP_IPC_HANDLE_SIZE];
 | 
				
			||||||
 | 
					} hipIpcMemHandle_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct hipIpcEventHandle_st {
 | 
				
			||||||
 | 
					    char reserved[HIP_IPC_HANDLE_SIZE];
 | 
				
			||||||
 | 
					} hipIpcEventHandle_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct ihipModule_t* hipModule_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct ihipModuleSymbol_t* hipFunction_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct hipFuncAttributes {
 | 
				
			||||||
 | 
					    int binaryVersion;
 | 
				
			||||||
 | 
					    int cacheModeCA;
 | 
				
			||||||
 | 
					    size_t constSizeBytes;
 | 
				
			||||||
 | 
					    size_t localSizeBytes;
 | 
				
			||||||
 | 
					    int maxDynamicSharedSizeBytes;
 | 
				
			||||||
 | 
					    int maxThreadsPerBlock;
 | 
				
			||||||
 | 
					    int numRegs;
 | 
				
			||||||
 | 
					    int preferredShmemCarveout;
 | 
				
			||||||
 | 
					    int ptxVersion;
 | 
				
			||||||
 | 
					    size_t sharedSizeBytes;
 | 
				
			||||||
 | 
					} hipFuncAttributes;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef struct ihipEvent_t* hipEvent_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * @brief hipDeviceAttribute_t
 | 
				
			||||||
 | 
					 * @enum
 | 
				
			||||||
 | 
					 * @ingroup Enumerations
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					typedef enum hipDeviceAttribute_t {
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxThreadsPerBlock,       ///< Maximum number of threads per block.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxBlockDimX,             ///< Maximum x-dimension of a block.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxBlockDimY,             ///< Maximum y-dimension of a block.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxBlockDimZ,             ///< Maximum z-dimension of a block.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxGridDimX,              ///< Maximum x-dimension of a grid.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxGridDimY,              ///< Maximum y-dimension of a grid.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxGridDimZ,              ///< Maximum z-dimension of a grid.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxSharedMemoryPerBlock,  ///< Maximum shared memory available per block in
 | 
				
			||||||
 | 
					                                                ///< bytes.
 | 
				
			||||||
 | 
					    hipDeviceAttributeTotalConstantMemory,      ///< Constant memory size in bytes.
 | 
				
			||||||
 | 
					    hipDeviceAttributeWarpSize,                 ///< Warp size in threads.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxRegistersPerBlock,  ///< Maximum number of 32-bit registers available to a
 | 
				
			||||||
 | 
					                                             ///< thread block. This number is shared by all thread
 | 
				
			||||||
 | 
					                                             ///< blocks simultaneously resident on a
 | 
				
			||||||
 | 
					                                             ///< multiprocessor.
 | 
				
			||||||
 | 
					    hipDeviceAttributeClockRate,             ///< Peak clock frequency in kilohertz.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMemoryClockRate,       ///< Peak memory clock frequency in kilohertz.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMemoryBusWidth,        ///< Global memory bus width in bits.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMultiprocessorCount,   ///< Number of multiprocessors on the device.
 | 
				
			||||||
 | 
					    hipDeviceAttributeComputeMode,           ///< Compute mode that device is currently in.
 | 
				
			||||||
 | 
					    hipDeviceAttributeL2CacheSize,  ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
 | 
				
			||||||
 | 
					                                    ///< cache.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxThreadsPerMultiProcessor,  ///< Maximum resident threads per
 | 
				
			||||||
 | 
					                                                    ///< multiprocessor.
 | 
				
			||||||
 | 
					    hipDeviceAttributeComputeCapabilityMajor,       ///< Major compute capability version number.
 | 
				
			||||||
 | 
					    hipDeviceAttributeComputeCapabilityMinor,       ///< Minor compute capability version number.
 | 
				
			||||||
 | 
					    hipDeviceAttributeConcurrentKernels,  ///< Device can possibly execute multiple kernels
 | 
				
			||||||
 | 
					                                          ///< concurrently.
 | 
				
			||||||
 | 
					    hipDeviceAttributePciBusId,           ///< PCI Bus ID.
 | 
				
			||||||
 | 
					    hipDeviceAttributePciDeviceId,        ///< PCI Device ID.
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,  ///< Maximum Shared Memory Per
 | 
				
			||||||
 | 
					                                                         ///< Multiprocessor.
 | 
				
			||||||
 | 
					    hipDeviceAttributeIsMultiGpuBoard,                   ///< Multiple GPU devices.
 | 
				
			||||||
 | 
					    hipDeviceAttributeIntegrated,                        ///< iGPU
 | 
				
			||||||
 | 
					    hipDeviceAttributeCooperativeLaunch,                 ///< Support cooperative launch
 | 
				
			||||||
 | 
					    hipDeviceAttributeCooperativeMultiDeviceLaunch,      ///< Support cooperative launch on multiple devices
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxTexture1DWidth,    ///< Maximum number of elements in 1D images
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxTexture2DWidth,    ///< Maximum dimension width of 2D images in image elements
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxTexture2DHeight,   ///< Maximum dimension height of 2D images in image elements
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxTexture3DWidth,    ///< Maximum dimension width of 3D images in image elements
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxTexture3DHeight,   ///< Maximum dimensions height of 3D images in image elements
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxTexture3DDepth,    ///< Maximum dimensions depth of 3D images in image elements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    hipDeviceAttributeHdpMemFlushCntl,      ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
 | 
				
			||||||
 | 
					    hipDeviceAttributeHdpRegFlushCntl,      ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    hipDeviceAttributeMaxPitch,             ///< Maximum pitch in bytes allowed by memory copies
 | 
				
			||||||
 | 
					    hipDeviceAttributeTextureAlignment,     ///<Alignment requirement for textures
 | 
				
			||||||
 | 
					    hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
 | 
				
			||||||
 | 
					    hipDeviceAttributeKernelExecTimeout,    ///<Run time limit for kernels executed on the device
 | 
				
			||||||
 | 
					    hipDeviceAttributeCanMapHostMemory,     ///<Device can map host memory into device address space
 | 
				
			||||||
 | 
					    hipDeviceAttributeEccEnabled,           ///<Device has ECC support enabled
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,        ///< Supports cooperative launch on multiple
 | 
				
			||||||
 | 
					                                                                  ///< devices with unmatched functions
 | 
				
			||||||
 | 
					    hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,     ///< Supports cooperative launch on multiple
 | 
				
			||||||
 | 
					                                                                  ///< devices with unmatched grid dimensions
 | 
				
			||||||
 | 
					    hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,    ///< Supports cooperative launch on multiple
 | 
				
			||||||
 | 
					                                                                  ///< devices with unmatched block dimensions
 | 
				
			||||||
 | 
					    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,   ///< Supports cooperative launch on multiple
 | 
				
			||||||
 | 
					                                                                  ///< devices with unmatched shared memories
 | 
				
			||||||
 | 
					    hipDeviceAttributeAsicRevision,         ///< Revision of the GPU in this device
 | 
				
			||||||
 | 
					    hipDeviceAttributeManagedMemory,        ///< Device supports allocating managed memory on this system
 | 
				
			||||||
 | 
					    hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
 | 
				
			||||||
 | 
					                                                      ///< the device without migration
 | 
				
			||||||
 | 
					    hipDeviceAttributeConcurrentManagedAccess,  ///< Device can coherently access managed memory
 | 
				
			||||||
 | 
					                                                ///< concurrently with the CPU
 | 
				
			||||||
 | 
					    hipDeviceAttributePageableMemoryAccess,     ///< Device supports coherently accessing pageable memory
 | 
				
			||||||
 | 
					                                                ///< without calling hipHostRegister on it
 | 
				
			||||||
 | 
					    hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
 | 
				
			||||||
 | 
					                                                              ///< the host's page tables
 | 
				
			||||||
 | 
					    hipDeviceAttributeCanUseStreamWaitValue ///< '1' if Device supports hipStreamWaitValue32() and
 | 
				
			||||||
 | 
					                                            ///< hipStreamWaitValue64() , '0' otherwise.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					} hipDeviceAttribute_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					typedef void* hipDeviceptr_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * @brief hipJitOption
 | 
				
			||||||
 | 
					 * @enum
 | 
				
			||||||
 | 
					 * @ingroup Enumerations
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					typedef enum hipJitOption {
 | 
				
			||||||
 | 
					    hipJitOptionMaxRegisters = 0,
 | 
				
			||||||
 | 
					    hipJitOptionThreadsPerBlock,
 | 
				
			||||||
 | 
					    hipJitOptionWallTime,
 | 
				
			||||||
 | 
					    hipJitOptionInfoLogBuffer,
 | 
				
			||||||
 | 
					    hipJitOptionInfoLogBufferSizeBytes,
 | 
				
			||||||
 | 
					    hipJitOptionErrorLogBuffer,
 | 
				
			||||||
 | 
					    hipJitOptionErrorLogBufferSizeBytes,
 | 
				
			||||||
 | 
					    hipJitOptionOptimizationLevel,
 | 
				
			||||||
 | 
					    hipJitOptionTargetFromContext,
 | 
				
			||||||
 | 
					    hipJitOptionTarget,
 | 
				
			||||||
 | 
					    hipJitOptionFallbackStrategy,
 | 
				
			||||||
 | 
					    hipJitOptionGenerateDebugInfo,
 | 
				
			||||||
 | 
					    hipJitOptionLogVerbose,
 | 
				
			||||||
 | 
					    hipJitOptionGenerateLineInfo,
 | 
				
			||||||
 | 
					    hipJitOptionCacheMode,
 | 
				
			||||||
 | 
					    hipJitOptionSm3xOpt,
 | 
				
			||||||
 | 
					    hipJitOptionFastCompile,
 | 
				
			||||||
 | 
					    hipJitOptionNumOptions
 | 
				
			||||||
 | 
					} hipJitOption;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					typedef enum hipFuncAttribute {
 | 
				
			||||||
 | 
					    hipFuncAttributeMaxDynamicSharedMemorySize = 8,
 | 
				
			||||||
 | 
					    hipFuncAttributePreferredSharedMemoryCarveout = 9,
 | 
				
			||||||
 | 
					    hipFuncAttributeMax
 | 
				
			||||||
 | 
					} hipFuncAttribute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					typedef enum hipFuncCache_t {
 | 
				
			||||||
 | 
					    hipFuncCachePreferNone,    ///< no preference for shared memory or L1 (default)
 | 
				
			||||||
 | 
					    hipFuncCachePreferShared,  ///< prefer larger shared memory and smaller L1 cache
 | 
				
			||||||
 | 
					    hipFuncCachePreferL1,      ///< prefer larger L1 cache and smaller shared memory
 | 
				
			||||||
 | 
					    hipFuncCachePreferEqual,   ///< prefer equal size L1 cache and shared memory
 | 
				
			||||||
 | 
					} hipFuncCache_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
 | 
				
			||||||
 | 
					#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
 | 
				
			||||||
 | 
					#define HIP_LAUNCH_PARAM_END ((void*)0x03)
 | 
				
			||||||
@@ -13,45 +13,40 @@
 | 
				
			|||||||
#include "triton/codegen/transform/peephole.h"
 | 
					#include "triton/codegen/transform/peephole.h"
 | 
				
			||||||
#include "triton/codegen/transform/pipeline.h"
 | 
					#include "triton/codegen/transform/pipeline.h"
 | 
				
			||||||
#include "triton/codegen/transform/prefetch.h"
 | 
					#include "triton/codegen/transform/prefetch.h"
 | 
				
			||||||
#include "triton/driver/device.h"
 | 
					 | 
				
			||||||
#include "triton/driver/kernel.h"
 | 
					 | 
				
			||||||
#include "triton/driver/module.h"
 | 
					 | 
				
			||||||
#include "triton/ir/function.h"
 | 
					#include "triton/ir/function.h"
 | 
				
			||||||
#include "triton/ir/module.h"
 | 
					#include "triton/ir/module.h"
 | 
				
			||||||
#include "triton/ir/print.h"
 | 
					#include "triton/ir/print.h"
 | 
				
			||||||
#include "llvm/IR/Module.h"
 | 
					#include "llvm/IR/Module.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/LegacyPassManager.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/Verifier.h"
 | 
				
			||||||
namespace triton {
 | 
					namespace triton {
 | 
				
			||||||
namespace codegen {
 | 
					namespace codegen {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// TODO:
 | 
					// TODO:
 | 
				
			||||||
// There should be a proper pass manager there!
 | 
					// There should be a proper pass manager there!
 | 
				
			||||||
void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps, int num_stages, bool force_nc_cache,
 | 
					std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx, codegen::target* target,
 | 
				
			||||||
                            driver::module *&mod, driver::kernel *&ker, size_t &shared_mem) {
 | 
					                                                     int cc, int num_warps, int num_stages, bool force_nc_cache, int& shared_static) {
 | 
				
			||||||
  // generate llvm code
 | 
					  // generate llvm code
 | 
				
			||||||
  llvm::LLVMContext ctx;
 | 
					 | 
				
			||||||
  std::string name = ir.get_function_list()[0]->get_name();
 | 
					  std::string name = ir.get_function_list()[0]->get_name();
 | 
				
			||||||
  std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
 | 
					  std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
 | 
				
			||||||
  // optimizations
 | 
					  // optimizations
 | 
				
			||||||
  std::unique_ptr<codegen::target> target = dev->make_target();
 | 
					  bool cts_use_async = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
 | 
				
			||||||
  bool cts_use_async = target->as_nvidia()->sm() >= 80;
 | 
					 | 
				
			||||||
  // create passes
 | 
					  // create passes
 | 
				
			||||||
  codegen::analysis::align align;
 | 
					  codegen::analysis::align align;
 | 
				
			||||||
  codegen::analysis::axes axes;
 | 
					  codegen::analysis::axes axes;
 | 
				
			||||||
  codegen::transform::cts cts(cts_use_async);
 | 
					  codegen::transform::cts cts(cts_use_async);
 | 
				
			||||||
  codegen::transform::pipeline pipeline(cts_use_async, num_stages);
 | 
					  codegen::transform::pipeline pipeline(cts_use_async, num_stages);
 | 
				
			||||||
  codegen::transform::disassociate disassociate;
 | 
					  codegen::transform::disassociate disassociate;
 | 
				
			||||||
  codegen::analysis::layouts layouts(&axes, &align, num_warps, target.get());
 | 
					  codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
 | 
				
			||||||
  codegen::analysis::liveness liveness(&layouts);
 | 
					  codegen::analysis::liveness liveness(&layouts);
 | 
				
			||||||
  codegen::analysis::swizzle swizzle(&layouts, target.get());
 | 
					  codegen::analysis::swizzle swizzle(&layouts, target);
 | 
				
			||||||
  codegen::analysis::allocation allocation(&liveness);
 | 
					  codegen::analysis::allocation allocation(&liveness);
 | 
				
			||||||
  codegen::transform::dce dce;
 | 
					  codegen::transform::dce dce;
 | 
				
			||||||
  codegen::transform::peephole peephole(target.get(), &layouts);
 | 
					  codegen::transform::peephole peephole(target, &layouts);
 | 
				
			||||||
//  codegen::transform::reassociate reassociate;
 | 
					 | 
				
			||||||
  codegen::transform::coalesce coalesce(&align, &layouts);
 | 
					  codegen::transform::coalesce coalesce(&align, &layouts);
 | 
				
			||||||
  codegen::transform::prefetch prefetch_s(target.get());
 | 
					  codegen::transform::prefetch prefetch_s(target);
 | 
				
			||||||
  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target.get());
 | 
					  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target);
 | 
				
			||||||
  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target.get(), num_warps, force_nc_cache);
 | 
					  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target, num_warps, force_nc_cache);
 | 
				
			||||||
  // run passes
 | 
					  // run passes
 | 
				
			||||||
  dce.run(ir);
 | 
					  dce.run(ir);
 | 
				
			||||||
  peephole.run(ir);
 | 
					  peephole.run(ir);
 | 
				
			||||||
@@ -72,15 +67,12 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
 | 
				
			|||||||
  layouts.run(ir);
 | 
					  layouts.run(ir);
 | 
				
			||||||
  coalesce.run(ir);
 | 
					  coalesce.run(ir);
 | 
				
			||||||
  dce.run(ir);
 | 
					  dce.run(ir);
 | 
				
			||||||
//  exit(1);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  align.run(ir);
 | 
					  align.run(ir);
 | 
				
			||||||
  dce.run(ir);
 | 
					  dce.run(ir);
 | 
				
			||||||
  if (target->is_gpu())
 | 
					  if (target->is_gpu())
 | 
				
			||||||
    cts.run(ir);
 | 
					    cts.run(ir);
 | 
				
			||||||
  dce.run(ir);
 | 
					  dce.run(ir);
 | 
				
			||||||
  align.run(ir);
 | 
					  align.run(ir);
 | 
				
			||||||
//  ir::print(ir, std::cout);
 | 
					 | 
				
			||||||
  axes.run(ir);
 | 
					  axes.run(ir);
 | 
				
			||||||
  layouts.run(ir);
 | 
					  layouts.run(ir);
 | 
				
			||||||
  peephole.run(ir);
 | 
					  peephole.run(ir);
 | 
				
			||||||
@@ -93,11 +85,9 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
 | 
				
			|||||||
  allocation.run(ir);
 | 
					  allocation.run(ir);
 | 
				
			||||||
  prefetch_s.run(ir);
 | 
					  prefetch_s.run(ir);
 | 
				
			||||||
  barriers.run(ir);
 | 
					  barriers.run(ir);
 | 
				
			||||||
  // ir.print(std::cout);
 | 
					 | 
				
			||||||
  isel.visit(ir, *llvm);
 | 
					  isel.visit(ir, *llvm);
 | 
				
			||||||
  mod = driver::module::create(dev, std::move(llvm));
 | 
					  shared_static = allocation.allocated_size();
 | 
				
			||||||
  ker = driver::kernel::create(&*mod, name.c_str());
 | 
					  return llvm;
 | 
				
			||||||
  shared_mem = allocation.allocated_size();
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
} // namespace codegen
 | 
					} // namespace codegen
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,231 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction,
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge,
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so,
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <vector>
 | 
					 | 
				
			||||||
#include <stdexcept>
 | 
					 | 
				
			||||||
#include "triton/driver/dispatch.h"
 | 
					 | 
				
			||||||
#include "triton/driver/backend.h"
 | 
					 | 
				
			||||||
#include "triton/driver/buffer.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/stream.h"
 | 
					 | 
				
			||||||
#include "triton/driver/kernel.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//-----------  Platforms ------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::platforms::init() {
 | 
					 | 
				
			||||||
  if(!cache_.empty())
 | 
					 | 
				
			||||||
    return;
 | 
					 | 
				
			||||||
  //if CUDA is here
 | 
					 | 
				
			||||||
  if(dispatch::cuinit()){
 | 
					 | 
				
			||||||
    cache_.push_back(new cu_platform());
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  //if host should be added
 | 
					 | 
				
			||||||
  bool host_visible = true;
 | 
					 | 
				
			||||||
  if(host_visible){
 | 
					 | 
				
			||||||
    cache_.push_back(new host_platform());
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//  //if OpenCL is here
 | 
					 | 
				
			||||||
//  if(dispatch::clinit()){
 | 
					 | 
				
			||||||
//    cl_uint num_platforms;
 | 
					 | 
				
			||||||
//    dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
 | 
					 | 
				
			||||||
//    std::vector<cl_platform_id> ids(num_platforms);
 | 
					 | 
				
			||||||
//    dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
 | 
					 | 
				
			||||||
//    for(cl_platform_id id: ids)
 | 
					 | 
				
			||||||
//      cache_.push_back(new cl_platform(id));
 | 
					 | 
				
			||||||
//  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if(cache_.empty())
 | 
					 | 
				
			||||||
    throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::platforms::get(std::vector<platform *> &results) {
 | 
					 | 
				
			||||||
  std::copy(cache_.begin(), cache_.end(), std::back_inserter(results));
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::vector<driver::platform*> backend::platforms::cache_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//-----------  Devices --------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::devices::init(std::vector<platform*> const & platforms) {
 | 
					 | 
				
			||||||
  if(!cache_.empty())
 | 
					 | 
				
			||||||
    return;
 | 
					 | 
				
			||||||
  for(driver::platform* pf: platforms)
 | 
					 | 
				
			||||||
    pf->devices(cache_);
 | 
					 | 
				
			||||||
  if(cache_.empty())
 | 
					 | 
				
			||||||
    throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::devices::get(std::vector<device*> &devs) {
 | 
					 | 
				
			||||||
  std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs));
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::vector<driver::device*> backend::devices::cache_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//---------- Modules ----------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::modules::release(){
 | 
					 | 
				
			||||||
  for(auto & x: cache_)
 | 
					 | 
				
			||||||
    delete x.second;
 | 
					 | 
				
			||||||
  cache_.clear();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::map<std::tuple<driver::stream*, std::string>, driver::module*>  backend::modules::cache_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//-----------  Kernels --------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::kernels::release(){
 | 
					 | 
				
			||||||
  for(auto & x: cache_)
 | 
					 | 
				
			||||||
    delete x.second;
 | 
					 | 
				
			||||||
  cache_.clear();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
 | 
					 | 
				
			||||||
  std::tuple<driver::module*, std::string> key(mod, name);
 | 
					 | 
				
			||||||
  if(cache_.find(key)==cache_.end()){
 | 
					 | 
				
			||||||
    return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  return cache_.at(key);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//------------  Queues --------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::streams::init(std::list<driver::context*> const & contexts){
 | 
					 | 
				
			||||||
  for(driver::context* ctx : contexts)
 | 
					 | 
				
			||||||
    if(cache_.find(ctx)==cache_.end())
 | 
					 | 
				
			||||||
      cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::streams::release(){
 | 
					 | 
				
			||||||
  for(auto & x: cache_)
 | 
					 | 
				
			||||||
    for(auto & y: x.second)
 | 
					 | 
				
			||||||
      delete y;
 | 
					 | 
				
			||||||
  cache_.clear();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
driver::stream* backend::streams::get_default()
 | 
					 | 
				
			||||||
{ return get(contexts::get_default(), 0); }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
driver::stream* backend::streams::get(driver::context* context, unsigned int id){
 | 
					 | 
				
			||||||
  init(std::list<driver::context*>(1,context));
 | 
					 | 
				
			||||||
  for(auto & x : cache_)
 | 
					 | 
				
			||||||
    if(x.first==context)
 | 
					 | 
				
			||||||
      return x.second[id];
 | 
					 | 
				
			||||||
  throw;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
 | 
					 | 
				
			||||||
  init(std::list<driver::context*>(1,context));
 | 
					 | 
				
			||||||
  queues = cache_.at(context);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//------------  Contexts ------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::contexts::init(std::vector<driver::device*> const & devices){
 | 
					 | 
				
			||||||
  for(driver::device* dvc: devices)
 | 
					 | 
				
			||||||
    cache_.push_back(driver::context::create(dvc));
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::contexts::release(){
 | 
					 | 
				
			||||||
  for(auto & x: cache_)
 | 
					 | 
				
			||||||
    delete x;
 | 
					 | 
				
			||||||
  cache_.clear();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
driver::context* backend::contexts::get_default(){
 | 
					 | 
				
			||||||
  backend::init();
 | 
					 | 
				
			||||||
  auto it = cache_.begin();
 | 
					 | 
				
			||||||
  std::advance(it, default_device);
 | 
					 | 
				
			||||||
  return *it;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::contexts::get(std::list<driver::context*> & contexts){
 | 
					 | 
				
			||||||
  backend::init();
 | 
					 | 
				
			||||||
  contexts = cache_;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::list<driver::context*> backend::contexts::cache_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
//------------  General -------------*/
 | 
					 | 
				
			||||||
/*-----------------------------------*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::synchronize(driver::context* context){
 | 
					 | 
				
			||||||
  for(driver::stream * queue: streams::cache_.at(context))
 | 
					 | 
				
			||||||
    queue->synchronize();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::release(){
 | 
					 | 
				
			||||||
  backend::kernels::release();
 | 
					 | 
				
			||||||
//  backend::programs::release();
 | 
					 | 
				
			||||||
  backend::streams::release();
 | 
					 | 
				
			||||||
  backend::contexts::release();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void backend::init(){
 | 
					 | 
				
			||||||
  if(!contexts::cache_.empty())
 | 
					 | 
				
			||||||
    return;
 | 
					 | 
				
			||||||
  // initialize platforms
 | 
					 | 
				
			||||||
  backend::platforms::init();
 | 
					 | 
				
			||||||
  // initialize devices
 | 
					 | 
				
			||||||
  backend::devices::init(platforms::cache_);
 | 
					 | 
				
			||||||
  // initialize contexts
 | 
					 | 
				
			||||||
  backend::contexts::init(devices::cache_);
 | 
					 | 
				
			||||||
  // initialize streams
 | 
					 | 
				
			||||||
  streams::init(contexts::cache_);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
unsigned int backend::default_device = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,90 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction,
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge,
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so,
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/stream.h"
 | 
					 | 
				
			||||||
#include "triton/driver/buffer.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/dispatch.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
 | 
					 | 
				
			||||||
  : polymorphic_resource(cu, take_ownership), size_(size) { }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
 | 
					 | 
				
			||||||
  : polymorphic_resource(hst, take_ownership), size_(size) { }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
size_t buffer::size() {
 | 
					 | 
				
			||||||
  return size_;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
uintptr_t buffer::addr_as_uintptr_t() {
 | 
					 | 
				
			||||||
  switch(backend_){
 | 
					 | 
				
			||||||
    case CUDA: return *cu_;
 | 
					 | 
				
			||||||
    case Host: return (uintptr_t)hst_->data;
 | 
					 | 
				
			||||||
    default: return 0;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
buffer* buffer::create(driver::context* ctx, size_t size) {
 | 
					 | 
				
			||||||
  switch(ctx->backend()){
 | 
					 | 
				
			||||||
  case CUDA: return new cu_buffer(size);
 | 
					 | 
				
			||||||
  case Host: return new host_buffer(size);
 | 
					 | 
				
			||||||
  default: throw std::runtime_error("unknown backend");
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
host_buffer::host_buffer(size_t size)
 | 
					 | 
				
			||||||
  :  buffer(size, host_buffer_t(), true){
 | 
					 | 
				
			||||||
  hst_->data = new char[size];
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cu_buffer::cu_buffer(size_t size)
 | 
					 | 
				
			||||||
  : buffer(size, CUdeviceptr(), true) {
 | 
					 | 
				
			||||||
  dispatch::cuMemAlloc(&*cu_, size);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
 | 
					 | 
				
			||||||
  : buffer(size, cu, take_ownership){
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void cu_buffer::set_zero(driver::stream* queue, size_t size){
 | 
					 | 
				
			||||||
  dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,118 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction,
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge,
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so,
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <cassert>
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/module.h"
 | 
					 | 
				
			||||||
#include "triton/tools/sys/getenv.hpp"
 | 
					 | 
				
			||||||
#include "triton/tools/sys/mkdir.hpp"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         BASE             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
context::context(driver::device *dev, CUcontext cu, bool take_ownership):
 | 
					 | 
				
			||||||
  polymorphic_resource(cu, take_ownership),
 | 
					 | 
				
			||||||
  dev_(dev), cache_path_(get_cache_path()) {
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
context::context(driver::device *dev, host_context_t hst, bool take_ownership):
 | 
					 | 
				
			||||||
  polymorphic_resource(hst, take_ownership),
 | 
					 | 
				
			||||||
  dev_(dev), cache_path_(get_cache_path()){
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
context* context::create(driver::device *dev){
 | 
					 | 
				
			||||||
  switch(dev->backend()){
 | 
					 | 
				
			||||||
  case CUDA: return new cu_context(dev);
 | 
					 | 
				
			||||||
  case Host: return new host_context(dev);
 | 
					 | 
				
			||||||
  default: throw std::runtime_error("unknown backend");
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
driver::device* context::device() const {
 | 
					 | 
				
			||||||
  return dev_;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::string context::get_cache_path(){
 | 
					 | 
				
			||||||
  //user-specified cache path
 | 
					 | 
				
			||||||
  std::string result = tools::getenv("TRITON_CACHE_PATH");
 | 
					 | 
				
			||||||
  if(!result.empty()){
 | 
					 | 
				
			||||||
    if(tools::mkpath(result)==0)
 | 
					 | 
				
			||||||
      return result;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  //create in home
 | 
					 | 
				
			||||||
  result = tools::getenv("HOME");
 | 
					 | 
				
			||||||
  if(!result.empty())
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    result = result + "/.triton/cache/";
 | 
					 | 
				
			||||||
    if(tools::mkpath(result)==0)
 | 
					 | 
				
			||||||
      return result;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  //couldn't find a directory
 | 
					 | 
				
			||||||
  return "";
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::string const & context::cache_path() const{
 | 
					 | 
				
			||||||
  return cache_path_;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         Host             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         CUDA             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// import CUdevice
 | 
					 | 
				
			||||||
CUdevice cu_context::get_device_of(CUcontext context){
 | 
					 | 
				
			||||||
  dispatch::cuCtxPushCurrent_v2(context);
 | 
					 | 
				
			||||||
  CUdevice res;
 | 
					 | 
				
			||||||
  dispatch::cuCtxGetDevice(&res);
 | 
					 | 
				
			||||||
  dispatch::cuCtxPopCurrent_v2(NULL);
 | 
					 | 
				
			||||||
  return res;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// wrapper for cuda context
 | 
					 | 
				
			||||||
cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
 | 
					 | 
				
			||||||
                                                                                context, take_ownership) {
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
 | 
					 | 
				
			||||||
  dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
 | 
					 | 
				
			||||||
//  dispatch::cuCtxPopCurrent_v2(NULL);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,192 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction,
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge,
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so,
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <map>
 | 
					 | 
				
			||||||
#include <algorithm>
 | 
					 | 
				
			||||||
#include <sstream>
 | 
					 | 
				
			||||||
#include <cstring>
 | 
					 | 
				
			||||||
#include <memory>
 | 
					 | 
				
			||||||
#include "triton/driver/device.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/error.h"
 | 
					 | 
				
			||||||
#include "triton/codegen/target.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//          Host            //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::unique_ptr<codegen::target> host_device::make_target() const {
 | 
					 | 
				
			||||||
  return std::unique_ptr<codegen::cpu_target>(new codegen::cpu_target());
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         CUDA             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// information query
 | 
					 | 
				
			||||||
template<CUdevice_attribute attr>
 | 
					 | 
				
			||||||
int cu_device::cuGetInfo() const{
 | 
					 | 
				
			||||||
  int res;
 | 
					 | 
				
			||||||
  dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
 | 
					 | 
				
			||||||
  return res;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// convert to nvml
 | 
					 | 
				
			||||||
nvmlDevice_t cu_device::nvml_device() const{
 | 
					 | 
				
			||||||
  std::map<std::string, nvmlDevice_t> map;
 | 
					 | 
				
			||||||
  std::string key = pci_bus_id();
 | 
					 | 
				
			||||||
  if(map.find(key)==map.end()){
 | 
					 | 
				
			||||||
    nvmlDevice_t device;
 | 
					 | 
				
			||||||
    dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
 | 
					 | 
				
			||||||
    return map.insert(std::make_pair(key, device)).first->second;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  return map.at(key);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// number of address bits
 | 
					 | 
				
			||||||
size_t cu_device::address_bits() const{
 | 
					 | 
				
			||||||
  return sizeof(size_t)*8;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// name
 | 
					 | 
				
			||||||
std::string cu_device::name() const {
 | 
					 | 
				
			||||||
    char tmp[128];
 | 
					 | 
				
			||||||
    dispatch::cuDeviceGetName(tmp, 128, *cu_);
 | 
					 | 
				
			||||||
    return std::string(tmp);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// PCI bus ID
 | 
					 | 
				
			||||||
std::string cu_device::pci_bus_id() const{
 | 
					 | 
				
			||||||
  char tmp[128];
 | 
					 | 
				
			||||||
  dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
 | 
					 | 
				
			||||||
  return std::string(tmp);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// force the device to be interpreted as a particular cc
 | 
					 | 
				
			||||||
void cu_device::interpret_as(int cc){
 | 
					 | 
				
			||||||
  interpreted_as_ = std::make_shared<int>(cc);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// compute capability
 | 
					 | 
				
			||||||
int cu_device::compute_capability() const {
 | 
					 | 
				
			||||||
  if(interpreted_as_)
 | 
					 | 
				
			||||||
    return *interpreted_as_;
 | 
					 | 
				
			||||||
  size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
 | 
					 | 
				
			||||||
  size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
 | 
					 | 
				
			||||||
  return major*10 + minor;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// maximum number of threads per block
 | 
					 | 
				
			||||||
size_t cu_device::max_threads_per_block() const {
 | 
					 | 
				
			||||||
  return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// maximum amount of shared memory per block
 | 
					 | 
				
			||||||
size_t cu_device::max_shared_memory() const {
 | 
					 | 
				
			||||||
  return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// warp size
 | 
					 | 
				
			||||||
size_t cu_device::warp_size() const {
 | 
					 | 
				
			||||||
  return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// maximum block dimensions
 | 
					 | 
				
			||||||
std::vector<size_t> cu_device::max_block_dim() const {
 | 
					 | 
				
			||||||
  std::vector<size_t> result(3);
 | 
					 | 
				
			||||||
  result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
 | 
					 | 
				
			||||||
  result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
 | 
					 | 
				
			||||||
  result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
 | 
					 | 
				
			||||||
  return result;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// current SM clock
 | 
					 | 
				
			||||||
size_t cu_device::current_sm_clock() const{
 | 
					 | 
				
			||||||
  unsigned int result;
 | 
					 | 
				
			||||||
  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
 | 
					 | 
				
			||||||
  return result;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// max SM clock
 | 
					 | 
				
			||||||
size_t cu_device::max_sm_clock() const{
 | 
					 | 
				
			||||||
  unsigned int result;
 | 
					 | 
				
			||||||
  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
 | 
					 | 
				
			||||||
  return result;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// current memory clock
 | 
					 | 
				
			||||||
size_t cu_device::current_mem_clock() const{
 | 
					 | 
				
			||||||
  unsigned int result;
 | 
					 | 
				
			||||||
  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
 | 
					 | 
				
			||||||
  return result;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// max memory clock
 | 
					 | 
				
			||||||
size_t cu_device::max_mem_clock() const{
 | 
					 | 
				
			||||||
  unsigned int result;
 | 
					 | 
				
			||||||
  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
 | 
					 | 
				
			||||||
  return result;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// max memory clock
 | 
					 | 
				
			||||||
void cu_device::set_max_clock() {
 | 
					 | 
				
			||||||
  dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void cu_device::enable_peer_access(CUdeviceptr peer_mem_ptr) const{
 | 
					 | 
				
			||||||
  CUcontext context;
 | 
					 | 
				
			||||||
  dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_mem_ptr);
 | 
					 | 
				
			||||||
  try {
 | 
					 | 
				
			||||||
    dispatch::cuCtxEnablePeerAccess(context, 0);
 | 
					 | 
				
			||||||
  } catch (exception::cuda::peer_access_already_enabled) {}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// print infos
 | 
					 | 
				
			||||||
std::string cu_device::infos() const{
 | 
					 | 
				
			||||||
  std::ostringstream oss;
 | 
					 | 
				
			||||||
  std::vector<size_t> max_wi_sizes = max_block_dim();
 | 
					 | 
				
			||||||
  oss << "Platform: CUDA" << std::endl;
 | 
					 | 
				
			||||||
  oss << "Name: " << name() << std::endl;
 | 
					 | 
				
			||||||
  oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
 | 
					 | 
				
			||||||
  oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
 | 
					 | 
				
			||||||
  oss << "Local memory size: " << max_shared_memory() << std::endl;
 | 
					 | 
				
			||||||
  return oss.str();
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// target
 | 
					 | 
				
			||||||
std::unique_ptr<codegen::target> cu_device::make_target() const {
 | 
					 | 
				
			||||||
  return std::unique_ptr<codegen::nvidia_cu_target>(new codegen::nvidia_cu_target(compute_capability()));
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@@ -21,7 +21,6 @@
 | 
				
			|||||||
*/
 | 
					*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "triton/driver/dispatch.h"
 | 
					#include "triton/driver/dispatch.h"
 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/tools/sys/getenv.hpp"
 | 
					#include "triton/tools/sys/getenv.hpp"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace triton
 | 
					namespace triton
 | 
				
			||||||
@@ -31,65 +30,65 @@ namespace driver
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
//Helpers for function definition
 | 
					//Helpers for function definition
 | 
				
			||||||
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
 | 
					#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
 | 
					#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
 | 
					#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
 | 
					#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
 | 
					#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
 | 
					#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
 | 
					#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
 | 
					#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
 | 
					#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
 | 
					#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
 | 
					#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
 | 
					#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
 | 
					#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
 | 
					#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
 | 
				
			||||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
 | 
					{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
 | 
				
			||||||
 | 
					void* dispatch::fname ## _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//Specialized helpers for CUDA
 | 
					 | 
				
			||||||
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
 | 
					 | 
				
			||||||
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
 | 
					 | 
				
			||||||
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
 | 
					 | 
				
			||||||
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
 | 
					 | 
				
			||||||
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* ------------------- *
 | 
				
			||||||
 | 
					 * CUDA
 | 
				
			||||||
 | 
					 * ------------------- */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
bool dispatch::cuinit(){
 | 
					bool dispatch::cuinit(){
 | 
				
			||||||
  if(cuda_==nullptr){
 | 
					  if(cuda_==nullptr){
 | 
				
			||||||
@@ -115,6 +114,74 @@ bool dispatch::cuinit(){
 | 
				
			|||||||
  return true;
 | 
					  return true;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
 | 
				
			||||||
 | 
					#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// context management
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuInit, unsigned int)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
 | 
				
			||||||
 | 
					// device management
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// link management
 | 
				
			||||||
 | 
					CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
 | 
				
			||||||
 | 
					CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
 | 
				
			||||||
 | 
					// module management
 | 
				
			||||||
 | 
					CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
 | 
				
			||||||
 | 
					CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
 | 
				
			||||||
 | 
					// stream management
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
 | 
				
			||||||
 | 
					CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
 | 
				
			||||||
 | 
					// function management
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
 | 
				
			||||||
 | 
					// memory management
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
 | 
				
			||||||
 | 
					CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
 | 
				
			||||||
 | 
					CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
 | 
				
			||||||
 | 
					CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
 | 
				
			||||||
 | 
					// event management
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
 | 
				
			||||||
 | 
					CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
 | 
				
			||||||
 | 
					CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
 | 
				
			||||||
 | 
					CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* ------------------- *
 | 
				
			||||||
 | 
					 * NVML
 | 
				
			||||||
 | 
					 * ------------------- */
 | 
				
			||||||
bool dispatch::nvmlinit(){
 | 
					bool dispatch::nvmlinit(){
 | 
				
			||||||
  if(nvml_==nullptr)
 | 
					  if(nvml_==nullptr)
 | 
				
			||||||
    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
 | 
					    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
 | 
				
			||||||
@@ -126,59 +193,93 @@ bool dispatch::nvmlinit(){
 | 
				
			|||||||
  return res;
 | 
					  return res;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//CUDA
 | 
					#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
 | 
					#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
 | 
					#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
 | 
					#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
 | 
					 | 
				
			||||||
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
 | 
					 | 
				
			||||||
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
 | 
					 | 
				
			||||||
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
 | 
					 | 
				
			||||||
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
 | 
					 | 
				
			||||||
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
 | 
					 | 
				
			||||||
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
 | 
					 | 
				
			||||||
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
 | 
					 | 
				
			||||||
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
 | 
					 | 
				
			||||||
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
 | 
					 | 
				
			||||||
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
 | 
					 | 
				
			||||||
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
 | 
					NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
 | 
				
			||||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 | 
					NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 | 
				
			||||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 | 
					NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 | 
				
			||||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
 | 
					NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* ------------------- *
 | 
				
			||||||
 | 
					 * HIP
 | 
				
			||||||
 | 
					 * ------------------- */
 | 
				
			||||||
 | 
					bool dispatch::hipinit(){
 | 
				
			||||||
 | 
					  if(hip_==nullptr)
 | 
				
			||||||
 | 
					    hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
 | 
				
			||||||
 | 
					  if(hip_ == nullptr)
 | 
				
			||||||
 | 
					    return false;
 | 
				
			||||||
 | 
					  hipError_t (*fptr)();
 | 
				
			||||||
 | 
					  hipInit_ = dlsym(hip_, "hipInit");
 | 
				
			||||||
 | 
					  *reinterpret_cast<void **>(&fptr) = hipInit_;
 | 
				
			||||||
 | 
					  hipError_t res = (*fptr)();
 | 
				
			||||||
 | 
					  check(res);
 | 
				
			||||||
 | 
					  return res;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
 | 
				
			||||||
 | 
					#define HIP_DEFINE2(ret, fname, t1, t2) DEFINE2(hipinit, hip_, ret, fname, t1, t2)
 | 
				
			||||||
 | 
					#define HIP_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
 | 
				
			||||||
 | 
					#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
 | 
				
			||||||
 | 
					#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
 | 
				
			||||||
 | 
					#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
 | 
				
			||||||
 | 
					#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
 | 
				
			||||||
 | 
					#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
 | 
				
			||||||
 | 
					#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
 | 
				
			||||||
 | 
					#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
 | 
				
			||||||
 | 
					#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// context management
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipInit, unsigned int)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
 | 
				
			||||||
 | 
					// device management
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
 | 
				
			||||||
 | 
					// module management
 | 
				
			||||||
 | 
					HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
 | 
				
			||||||
 | 
					HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
 | 
				
			||||||
 | 
					// stream management
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
 | 
				
			||||||
 | 
					HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
 | 
				
			||||||
 | 
					// function management
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
 | 
				
			||||||
 | 
					// memory management
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
 | 
				
			||||||
 | 
					HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
 | 
				
			||||||
 | 
					HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)
 | 
				
			||||||
 | 
					HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
 | 
				
			||||||
 | 
					// event management
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
 | 
				
			||||||
 | 
					HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
 | 
				
			||||||
 | 
					HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
 | 
				
			||||||
 | 
					HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* ------------------- *
 | 
				
			||||||
 | 
					 * COMMON
 | 
				
			||||||
 | 
					 * ------------------- */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Release
 | 
					// Release
 | 
				
			||||||
void dispatch::release(){
 | 
					void dispatch::release(){
 | 
				
			||||||
@@ -190,61 +291,9 @@ void dispatch::release(){
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
void* dispatch::cuda_;
 | 
					void* dispatch::cuda_;
 | 
				
			||||||
void* dispatch::nvml_;
 | 
					void* dispatch::nvml_;
 | 
				
			||||||
 | 
					 | 
				
			||||||
//CUDA
 | 
					 | 
				
			||||||
void* dispatch::cuCtxGetCurrent_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxSetCurrent_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxDestroy_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuEventCreate_;
 | 
					 | 
				
			||||||
void* dispatch::cuDeviceGet_;
 | 
					 | 
				
			||||||
void* dispatch::cuMemcpyDtoH_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuStreamCreate_;
 | 
					 | 
				
			||||||
void* dispatch::cuEventElapsedTime_;
 | 
					 | 
				
			||||||
void* dispatch::cuMemFree_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuMemcpyDtoHAsync_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuDriverGetVersion_;
 | 
					 | 
				
			||||||
void* dispatch::cuDeviceGetName_;
 | 
					 | 
				
			||||||
void* dispatch::cuDeviceGetPCIBusId_;
 | 
					 | 
				
			||||||
void* dispatch::cuModuleGetGlobal_v2_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void* dispatch::cuLinkAddData_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuLinkCreate_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuLinkDestroy_;
 | 
					 | 
				
			||||||
void* dispatch::cuModuleLoadData_;
 | 
					 | 
				
			||||||
void* dispatch::cuLinkComplete_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void* dispatch::cuMemcpyHtoDAsync_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuModuleLoad_;
 | 
					 | 
				
			||||||
void* dispatch::cuLaunchKernel_;
 | 
					 | 
				
			||||||
void* dispatch::cuModuleUnload_;
 | 
					 | 
				
			||||||
void* dispatch::cuModuleLoadDataEx_;
 | 
					 | 
				
			||||||
void* dispatch::cuDeviceGetAttribute_;
 | 
					 | 
				
			||||||
void* dispatch::cuDeviceGetCount_;
 | 
					 | 
				
			||||||
void* dispatch::cuMemcpyHtoD_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuInit_;
 | 
					 | 
				
			||||||
void* dispatch::cuEventRecord_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxCreate_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuModuleGetFunction_;
 | 
					 | 
				
			||||||
void* dispatch::cuStreamSynchronize_;
 | 
					 | 
				
			||||||
void* dispatch::cuStreamDestroy_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuStreamGetCtx_;
 | 
					 | 
				
			||||||
void* dispatch::cuEventDestroy_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuMemAlloc_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuPointerGetAttribute_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxGetDevice_;
 | 
					 | 
				
			||||||
void* dispatch::cuMemsetD8Async_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxPushCurrent_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxPopCurrent_v2_;
 | 
					 | 
				
			||||||
void* dispatch::cuFuncGetAttribute_;
 | 
					 | 
				
			||||||
void* dispatch::cuFuncSetAttribute_;
 | 
					 | 
				
			||||||
void* dispatch::cuFuncSetCacheConfig_;
 | 
					 | 
				
			||||||
void* dispatch::cuCtxEnablePeerAccess_;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void* dispatch::nvmlInit_v2_;
 | 
					void* dispatch::nvmlInit_v2_;
 | 
				
			||||||
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
 | 
					void* dispatch::hip_;
 | 
				
			||||||
void* dispatch::nvmlDeviceGetClockInfo_;
 | 
					
 | 
				
			||||||
void* dispatch::nvmlDeviceGetMaxClockInfo_;
 | 
					 | 
				
			||||||
void* dispatch::nvmlDeviceSetApplicationsClocks_;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -94,6 +94,73 @@ void check(CUresult err)
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void check(hipError_t error) {
 | 
				
			||||||
 | 
					  using namespace exception::hip;
 | 
				
			||||||
 | 
					  switch(error)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					  case hipSuccess                              : break;
 | 
				
			||||||
 | 
					    case hipErrorInvalidValue                  : throw invalid_value();
 | 
				
			||||||
 | 
					    case hipErrorMemoryAllocation                  : throw out_of_memory();
 | 
				
			||||||
 | 
					    case hipErrorNotInitialized                : throw not_initialized();
 | 
				
			||||||
 | 
					    case hipErrorDeinitialized                  : throw deinitialized();
 | 
				
			||||||
 | 
					    case hipErrorProfilerDisabled              : throw profiler_disabled();
 | 
				
			||||||
 | 
					    case hipErrorProfilerNotInitialized       : throw profiler_not_initialized();
 | 
				
			||||||
 | 
					    case hipErrorProfilerAlreadyStarted       : throw profiler_already_started();
 | 
				
			||||||
 | 
					    case hipErrorProfilerAlreadyStopped       : throw profiler_already_stopped();
 | 
				
			||||||
 | 
					    case hipErrorNoDevice                      : throw no_device();
 | 
				
			||||||
 | 
					    case hipErrorInvalidSymbol                      : throw invalid_symbol();
 | 
				
			||||||
 | 
					    case hipErrorInvalidDevice                 : throw invalid_device();
 | 
				
			||||||
 | 
					    case hipErrorInvalidImage                  : throw invalid_image();
 | 
				
			||||||
 | 
					    case hipErrorInvalidContext                : throw invalid_context();
 | 
				
			||||||
 | 
					    case hipErrorContextAlreadyCurrent        : throw context_already_current();
 | 
				
			||||||
 | 
					    case hipErrorMapFailed                     : throw map_failed();
 | 
				
			||||||
 | 
					    case hipErrorUnmapFailed                   : throw unmap_failed();
 | 
				
			||||||
 | 
					    case hipErrorArrayIsMapped                : throw array_is_mapped();
 | 
				
			||||||
 | 
					    case hipErrorAlreadyMapped                 : throw already_mapped();
 | 
				
			||||||
 | 
					    case hipErrorNoBinaryForGpu              : throw no_binary_for_gpu();
 | 
				
			||||||
 | 
					    case hipErrorAlreadyAcquired               : throw already_acquired();
 | 
				
			||||||
 | 
					    case hipErrorNotMapped                     : throw not_mapped();
 | 
				
			||||||
 | 
					    case hipErrorNotMappedAsArray             : throw not_mapped_as_array();
 | 
				
			||||||
 | 
					    case hipErrorNotMappedAsPointer           : throw not_mapped_as_pointer();
 | 
				
			||||||
 | 
					    case hipErrorECCNotCorrectable            : throw ecc_uncorrectable();
 | 
				
			||||||
 | 
					    case hipErrorUnsupportedLimit             : throw unsupported_limit();
 | 
				
			||||||
 | 
					    case hipErrorContextAlreadyInUse          : throw context_already_in_use();
 | 
				
			||||||
 | 
					    case hipErrorPeerAccessUnsupported        : throw peer_access_unsupported();
 | 
				
			||||||
 | 
					    case hipErrorInvalidKernelFile            : throw invalid_ptx();
 | 
				
			||||||
 | 
					    case hipErrorInvalidGraphicsContext       : throw invalid_graphics_context();
 | 
				
			||||||
 | 
					    case hipErrorInvalidSource                 : throw invalid_source();
 | 
				
			||||||
 | 
					    case hipErrorFileNotFound                 : throw file_not_found();
 | 
				
			||||||
 | 
					    case hipErrorSharedObjectSymbolNotFound : throw shared_object_symbol_not_found();
 | 
				
			||||||
 | 
					    case hipErrorSharedObjectInitFailed      : throw shared_object_init_failed();
 | 
				
			||||||
 | 
					    case hipErrorOperatingSystem               : throw operating_system();
 | 
				
			||||||
 | 
					    case hipErrorInvalidResourceHandle                 : throw invalid_handle();
 | 
				
			||||||
 | 
					    case hipErrorNotFound                      : throw not_found();
 | 
				
			||||||
 | 
					    case hipErrorNotReady                      : throw not_ready();
 | 
				
			||||||
 | 
					    case hipErrorIllegalAddress                : throw illegal_address();
 | 
				
			||||||
 | 
					    case hipErrorLaunchOutOfResources        : throw launch_out_of_resources();
 | 
				
			||||||
 | 
					    case hipErrorLaunchTimeOut                 : throw launch_timeout();
 | 
				
			||||||
 | 
					    // case hipErrorLaunchIncompatibleTexturing  : throw launch_incompatible_texturing();
 | 
				
			||||||
 | 
					    case hipErrorPeerAccessAlreadyEnabled    : throw peer_access_already_enabled();
 | 
				
			||||||
 | 
					    case hipErrorPeerAccessNotEnabled        : throw peer_access_not_enabled();
 | 
				
			||||||
 | 
					    // case hipErrorPrimaryContextActive         : throw primary_context_active();
 | 
				
			||||||
 | 
					    // case hipErrorContextIsDestroyed           : throw context_is_destroyed();
 | 
				
			||||||
 | 
					    case hipErrorAssert                         : throw assert_error();
 | 
				
			||||||
 | 
					    // case hipErrorTooManyPeers                 : throw too_many_peers();
 | 
				
			||||||
 | 
					    case hipErrorHostMemoryAlreadyRegistered : throw host_memory_already_registered();
 | 
				
			||||||
 | 
					    case hipErrorHostMemoryNotRegistered     : throw host_memory_not_registered();
 | 
				
			||||||
 | 
					    // case hipErrorHardwareStackError           : throw hardware_stack_error();
 | 
				
			||||||
 | 
					    // case hipErrorIllegalInstruction            : throw illegal_instruction();
 | 
				
			||||||
 | 
					    // case hipErrorMisalignedAddress             : throw misaligned_address();
 | 
				
			||||||
 | 
					    // case hipErrorInvalidAddressSpace          : throw invalid_address_space();
 | 
				
			||||||
 | 
					    // case hipErrorInvalidPc                     : throw invalid_pc();
 | 
				
			||||||
 | 
					    case hipErrorLaunchFailure                  : throw launch_failed();
 | 
				
			||||||
 | 
					    // case hipErrorNotPermitted                  : throw not_permitted();
 | 
				
			||||||
 | 
					    case hipErrorNotSupported                  : throw not_supported();
 | 
				
			||||||
 | 
					    case hipErrorUnknown                        : throw unknown();
 | 
				
			||||||
 | 
					    default                                        : throw unknown();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,91 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining 
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files 
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction, 
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge, 
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software, 
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so, 
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be 
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include "triton/driver/handle.h"
 | 
					 | 
				
			||||||
#include "triton/driver/error.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//Host
 | 
					 | 
				
			||||||
inline void _delete(host_platform_t) { }
 | 
					 | 
				
			||||||
inline void _delete(host_device_t)   { }
 | 
					 | 
				
			||||||
inline void _delete(host_context_t)  { }
 | 
					 | 
				
			||||||
inline void _delete(host_module_t)   { }
 | 
					 | 
				
			||||||
inline void _delete(host_stream_t)   { }
 | 
					 | 
				
			||||||
inline void _delete(host_buffer_t x)   { if(x.data) delete[] x.data; }
 | 
					 | 
				
			||||||
inline void _delete(host_function_t) { }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//CUDA
 | 
					 | 
				
			||||||
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
 | 
					 | 
				
			||||||
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
 | 
					 | 
				
			||||||
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
 | 
					 | 
				
			||||||
inline void _delete(CUdevice) { }
 | 
					 | 
				
			||||||
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
 | 
					 | 
				
			||||||
inline void _delete(CUfunction) { }
 | 
					 | 
				
			||||||
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
 | 
					 | 
				
			||||||
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
 | 
					 | 
				
			||||||
inline void _delete(CUPlatform){}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//Constructor
 | 
					 | 
				
			||||||
template<class T>
 | 
					 | 
				
			||||||
handle<T>::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership)
 | 
					 | 
				
			||||||
{ }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template<class T>
 | 
					 | 
				
			||||||
handle<T>::handle(): has_ownership_(false){ }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template<class T>
 | 
					 | 
				
			||||||
handle<T>::~handle(){
 | 
					 | 
				
			||||||
  try{
 | 
					 | 
				
			||||||
    if(has_ownership_ && h_ && h_.unique())
 | 
					 | 
				
			||||||
      _delete(*h_);
 | 
					 | 
				
			||||||
  }catch(const exception::cuda::base&){
 | 
					 | 
				
			||||||
    // order of destruction for global variables
 | 
					 | 
				
			||||||
    // is not guaranteed
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template class handle<CUdeviceptr>;
 | 
					 | 
				
			||||||
template class handle<CUstream>;
 | 
					 | 
				
			||||||
template class handle<CUcontext>;
 | 
					 | 
				
			||||||
template class handle<CUdevice>;
 | 
					 | 
				
			||||||
template class handle<cu_event_t>;
 | 
					 | 
				
			||||||
template class handle<CUfunction>;
 | 
					 | 
				
			||||||
template class handle<CUmodule>;
 | 
					 | 
				
			||||||
template class handle<CUPlatform>;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
template class handle<host_platform_t>;
 | 
					 | 
				
			||||||
template class handle<host_device_t>;
 | 
					 | 
				
			||||||
template class handle<host_context_t>;
 | 
					 | 
				
			||||||
template class handle<host_module_t>;
 | 
					 | 
				
			||||||
template class handle<host_stream_t>;
 | 
					 | 
				
			||||||
template class handle<host_buffer_t>;
 | 
					 | 
				
			||||||
template class handle<host_function_t>;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,94 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction,
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge,
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so,
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <string.h>
 | 
					 | 
				
			||||||
#include "triton/driver/kernel.h"
 | 
					 | 
				
			||||||
#include "triton/driver/buffer.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         Base             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership):
 | 
					 | 
				
			||||||
  polymorphic_resource(fn, has_ownership), program_(program){
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership):
 | 
					 | 
				
			||||||
  polymorphic_resource(fn, has_ownership), program_(program){
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
kernel* kernel::create(driver::module* program, const char* name) {
 | 
					 | 
				
			||||||
    switch(program->backend()){
 | 
					 | 
				
			||||||
    case CUDA: return new cu_kernel(program, name);
 | 
					 | 
				
			||||||
    case Host: return new host_kernel(program, name);
 | 
					 | 
				
			||||||
    default: throw std::runtime_error("unknown backend");
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
driver::module* kernel::module() {
 | 
					 | 
				
			||||||
  return program_;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         Host             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Builds a host kernel with a default-constructed host_function_t, then fills
// in its `fn` member by looking `name` up (via .at()) in the function table of
// the host module held by `program`.
host_kernel::host_kernel(driver::module* program, const char *name)
  : kernel(program, host_function_t(), true)
{
  hst_->fn = program->hst()->functions.at(name);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         CUDA             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Looks `name` up in the CUDA module held by `program`, requests the
// CU_FUNC_CACHE_PREFER_SHARED cache configuration, and queries a handful of
// device/function attributes. When the device's opt-in per-block shared memory
// limit exceeds 49152 bytes, the function's maximum dynamic shared memory size
// is raised to (opt-in limit - statically used shared memory).
cu_kernel::cu_kernel(driver::module *program, const char * name)
  : kernel(program, CUfunction(), true)
{
  dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
  dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED);

  // device- and function-level properties
  int smem_per_sm, smem_optin, smem_static;
  int local_bytes, num_regs;
  CUdevice device;
  dispatch::cuCtxGetDevice(&device);
  dispatch::cuDeviceGetAttribute(&smem_per_sm, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device);
  dispatch::cuDeviceGetAttribute(&smem_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device);
  dispatch::cuFuncGetAttribute(&smem_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *cu_);
  dispatch::cuFuncGetAttribute(&local_bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,  *cu_);
  dispatch::cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, *cu_);
  // smem_per_sm, local_bytes and num_regs are queried but not used below.

  const bool beyond_default_limit = smem_optin > 49152;
  if(!beyond_default_limit)
    return;
  dispatch::cuFuncSetAttribute(*cu_, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem_optin - smem_static);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										324
									
								
								lib/driver/llvm.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										324
									
								
								lib/driver/llvm.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,324 @@
 | 
				
			|||||||
 | 
					/* Copyright 2015-2017 Philippe Tillet
 | 
				
			||||||
 | 
					*
 | 
				
			||||||
 | 
					* Permission is hereby granted, free of charge, to any person obtaining
 | 
				
			||||||
 | 
					* a copy of this software and associated documentation files
 | 
				
			||||||
 | 
					* (the "Software"), to deal in the Software without restriction,
 | 
				
			||||||
 | 
					* including without limitation the rights to use, copy, modify, merge,
 | 
				
			||||||
 | 
					* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
				
			||||||
 | 
					* and to permit persons to whom the Software is furnished to do so,
 | 
				
			||||||
 | 
					* subject to the following conditions:
 | 
				
			||||||
 | 
					*
 | 
				
			||||||
 | 
					* The above copyright notice and this permission notice shall be
 | 
				
			||||||
 | 
					* included in all copies or substantial portions of the Software.
 | 
				
			||||||
 | 
					*
 | 
				
			||||||
 | 
					* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
				
			||||||
 | 
					* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
				
			||||||
 | 
					* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
				
			||||||
 | 
					* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
				
			||||||
 | 
					* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
				
			||||||
 | 
					* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
				
			||||||
 | 
					* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
 | 
					#include <fstream>
 | 
				
			||||||
 | 
					#include <unistd.h>
 | 
				
			||||||
 | 
					#include <memory>
 | 
				
			||||||
 | 
					#include <regex>
 | 
				
			||||||
 | 
					#include "triton/driver/llvm.h"
 | 
				
			||||||
 | 
					#include "triton/driver/dispatch.h"
 | 
				
			||||||
 | 
					#include "triton/driver/error.h"
 | 
				
			||||||
 | 
					#include "triton/tools/sha1.hpp"
 | 
				
			||||||
 | 
					#include "triton/tools/sys/getenv.hpp"
 | 
				
			||||||
 | 
					#include "triton/tools/sys/mkdir.hpp"
 | 
				
			||||||
 | 
					#include "triton/tools/sys/exec.hpp"
 | 
				
			||||||
 | 
					#include "llvm/IR/IRBuilder.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/Verifier.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/IRPrintingPasses.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/Module.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/CodeGen.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/CommandLine.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/SourceMgr.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/raw_ostream.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/TargetRegistry.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/TargetSelect.h"
 | 
				
			||||||
 | 
					#include "llvm/Target/TargetMachine.h"
 | 
				
			||||||
 | 
					#include "llvm/Target/TargetOptions.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/LegacyPassManager.h"
 | 
				
			||||||
 | 
					#include "llvm/ExecutionEngine/ExecutionEngine.h"
 | 
				
			||||||
 | 
					#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 | 
				
			||||||
 | 
					#include "llvm/Transforms/Utils/Cloning.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// begin AMD stuff
 | 
				
			||||||
 | 
					#include "llvm/Support/FileSystem.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/FormattedStream.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/Program.h"
 | 
				
			||||||
 | 
					#include "llvm/Support/ToolOutputFile.h"
 | 
				
			||||||
 | 
					#include "llvm/ADT/StringRef.h"
 | 
				
			||||||
 | 
					#include "llvm/Analysis/TargetLibraryInfo.h"
 | 
				
			||||||
 | 
					// end AMD stuff
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace triton{
 | 
				
			||||||
 | 
					namespace driver{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void init_llvm() {
 | 
				
			||||||
 | 
					  static bool init = false;
 | 
				
			||||||
 | 
					  if(!init){
 | 
				
			||||||
 | 
					    LLVMInitializeNVPTXTargetInfo();
 | 
				
			||||||
 | 
					    LLVMInitializeNVPTXTarget();
 | 
				
			||||||
 | 
					    LLVMInitializeNVPTXTargetMC();
 | 
				
			||||||
 | 
					    LLVMInitializeNVPTXAsmPrinter();
 | 
				
			||||||
 | 
					    LLVMInitializeAMDGPUTargetInfo();
 | 
				
			||||||
 | 
					    LLVMInitializeAMDGPUTarget();
 | 
				
			||||||
 | 
					    LLVMInitializeAMDGPUTargetMC();
 | 
				
			||||||
 | 
					    LLVMInitializeAMDGPUAsmPrinter();
 | 
				
			||||||
 | 
					    init = true;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* ------------------------ */
 | 
				
			||||||
 | 
					//         CUDA             //
 | 
				
			||||||
 | 
					/* ------------------------ */
 | 
				
			||||||
 | 
					// Replaces one span of `str` with `target`. The span starts at the first
					// occurrence of `begin` and runs through the first character of the next
					// occurrence of `end` at or after that position. If `end` does not occur
					// there, the span extends to the end of `str`.
					//
					// Returns false, leaving `str` untouched, when `begin` is not found;
					// returns true after a replacement.
					static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
					  const size_t start_replace = str.find(begin);
					  if(start_replace == std::string::npos)
					    return false;
					  const size_t end_replace = str.find(end, start_replace);
					  // Handle a missing `end` explicitly: `npos + 1 - start_replace` wraps around,
					  // and for start_replace == 0 it becomes a zero-length span, so nothing would
					  // be removed and callers looping on the return value would never terminate.
					  const size_t count = (end_replace == std::string::npos)
					                         ? std::string::npos
					                         : end_replace + 1 - start_replace;
					  str.replace(start_replace, count, target);
					  return true;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Maps a CUDA version number to a two-digit PTX version (e.g. 73).
					// The encoding of `version` looks like major*1000 + minor*10 (11030 for
					// CUDA 11.3) -- TODO confirm against the caller.
					// Throws std::runtime_error for versions below 10000.
					int vptx(int version){
					  // {lowest version that qualifies, PTX version}, ordered newest first
					  static const int table[][2] = {
					    {11030, 73},
					    {11020, 72},
					    {11010, 71},
					    {11000, 70},
					    {10020, 65},
					    {10010, 64},
					    {10000, 63},
					  };
					  for(const auto& row : table){
					    if(version >= row[0])
					      return row[1];
					  }
					  throw std::runtime_error("Triton requires CUDA 10+");
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Lowers `module` to PTX assembly text through LLVM's NVPTX back-end.
					//
					//   module  - LLVM module to compile; it is modified in place (target triple,
					//             data layout and an AlwaysInline attribute on every function).
					//   cc      - compute capability as an integer (used as "sm_<cc>").
					//   version - version number handed to vptx() to select the PTX version.
					//
					// Code generation is capped at sm_75 / PTX 6.4; the `.version` and `.target`
					// directives of the emitted text are then rewritten to the requested values,
					// and the "begin/end inline asm" comment lines are stripped.
					//
					// Throws std::runtime_error if the `nvptx-short-ptr` option or the NVPTX
					// target is unavailable, or if the target machine cannot emit assembly.
					std::string llir_to_ptx(llvm::Module* module, int cc, int version){
					  // LLVM version in use may not officially support target hardware
					  const int max_nvvm_cc = 75;
					  const int max_nvvm_ptx = 64;
					  // options: bind by reference and use lookup() so the global registry is
					  // neither copied nor extended with a null entry when the flag is missing
					  auto& options = llvm::cl::getRegisteredOptions();
					  auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options.lookup("nvptx-short-ptr"));
					  if(!short_ptr)
					    throw std::runtime_error("LLVM option nvptx-short-ptr is not registered");
					  short_ptr->setValue(true);
					  // compute capability
					  std::string sm = "sm_" + std::to_string(cc);
					  // max PTX version
					  int ptx = vptx(version);
					  int ptx_major = ptx / 10;
					  int ptx_minor = ptx % 10;
					  // target description
					  std::string triple = "nvptx64-nvidia-cuda";
					  std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
					  std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
					  init_llvm();
					  // verify llvm
					  llvm::legacy::PassManager pm;
					  pm.add(llvm::createVerifierPass());
					  pm.run(*module);
					  // create machine
					  module->setTargetTriple(triple);
					  std::string error;
					  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
					  if(!target)
					    throw std::runtime_error("unable to find LLVM target for " + triple + ": " + error);
					  llvm::TargetOptions opt;
					  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
					  opt.UnsafeFPMath = false;
					  opt.NoInfsFPMath = false;
					  opt.NoNaNsFPMath = true;
					  // owned here so the machine is released on every exit path
					  std::unique_ptr<llvm::TargetMachine> machine(
					      target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
					                                  llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive));
					  if(!machine)
					    throw std::runtime_error("unable to create LLVM target machine for " + proc);
					  // set data layout
					  module->setDataLayout(machine->createDataLayout());
					  // emit machine code
					  for (llvm::Function &f : module->functions())
					    f.addFnAttr(llvm::Attribute::AlwaysInline);
					  llvm::SmallVector<char, 0> buffer;
					  llvm::legacy::PassManager pass;
					  llvm::raw_svector_ostream stream(buffer);
					  // addPassesToEmitFile returns true when the file type is not supported
					  if(machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile))
					    throw std::runtime_error("LLVM target machine cannot emit PTX assembly");
					  pass.run(*module);
					  // post-process
					  std::string result(buffer.begin(), buffer.end());
					  find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
					  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
					  while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
					  while(find_and_replace(result, "\t// end inline asm", "\n", ""));
					  return result;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Turns PTX text into a loaded CUmodule.
					//
					// If running `ptxas --version` through tools::exec succeeds, the PTX is
					// written to a temporary file, assembled with `ptxas --gpu-name=sm_<cc>`, and
					// the resulting object is loaded with cuModuleLoad. Otherwise the PTX is
					// handed to cuModuleLoadDataEx.
					//
					// Throws std::runtime_error if the temporary files cannot be created or if
					// ptxas exits with a non-zero status (the message carries the ptxas log).
					// exception::cuda::invalid_ptx is logged together with the PTX and rethrown.
					CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
					  // JIT compile source-code
					  try{
					    // use ptxas if present in PATH. Otherwise, use JIT from the driver
					    std::string ptxas = "ptxas";
					    std::string version;
					    bool use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
					    // Use PTXAS via system call
					    if(use_system_ptxas){
					      char _fsrc[] = "/tmp/triton_k_XXXXXX";
					      char _flog[] = "/tmp/triton_l_XXXXXX";
					      int fd_src = mkstemp(_fsrc);
					      int fd_log = mkstemp(_flog);
					      if(fd_src < 0 || fd_log < 0){
					        // release whichever file was created before giving up
					        if(fd_src >= 0){ close(fd_src); unlink(_fsrc); }
					        if(fd_log >= 0){ close(fd_log); unlink(_flog); }
					        throw std::runtime_error("unable to create temporary files for ptxas");
					      }
					      // only the unique names are needed; the files are reopened by path
					      close(fd_src);
					      close(fd_log);
					      std::string fsrc = _fsrc;
					      std::string flog = _flog;
					      std::string fbin = fsrc + ".o";
					      // removes all temporary files on every exit path, including exceptions
					      struct cleanup_t {
					        const std::string& src;
					        const std::string& log;
					        const std::string& bin;
					        ~cleanup_t(){
					          unlink(src.c_str());
					          unlink(log.c_str());
					          unlink(bin.c_str());
					        }
					      } cleanup{fsrc, flog, fbin};
					      // compile ptx with ptxas
					      std::ofstream ofs(fsrc);
					      ofs << ptx;
					      ofs.close();
					      std::string cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fbin + " 2> " + flog;
					      int err = system(cmd.c_str());
					      if(err != 0){
					        // surface the assembler diagnostics instead of failing later on a missing object file
					        std::ifstream ifs(flog);
					        std::string log, line;
					        while(std::getline(ifs, line))
					          log += line + "\n";
					        throw std::runtime_error("ptxas failed with status " + std::to_string(err) + ":\n" + log);
					      }
					      CUmodule ret;
					      dispatch::cuModuleLoad(&ret, fbin.c_str());
					      return ret;
					    }
					    // Use PTXAS included in driver
					    CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
					                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
					                          CU_JIT_LOG_VERBOSE};
					    // constant sizes keep the buffers standard C++ arrays rather than VLAs
					    const unsigned int errbufsize = 8192;
					    const unsigned int logbufsize = 8192;
					    char _err[errbufsize];
					    char _log[logbufsize];
					    void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
					    CUmodule ret;
					    dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
					    return ret;
					  }
					  catch(exception::cuda::invalid_ptx const &){
					    // announce the problem first, then dump the offending PTX
					    std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
					    std::cout << ptx << std::endl;
					    throw;
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* ------------------------ */
 | 
				
			||||||
 | 
					//         HIP              //
 | 
				
			||||||
 | 
					/* ------------------------ */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Compiles `module` for the amdgcn-amd-amdhsa target, writes the object file
					// to /tmp/<module identifier>.o, links it with /opt/rocm/llvm/bin/ld.lld into
					// /tmp/<module identifier>.hsaco, and returns the path of the .hsaco file.
					// `module` is modified in place (target triple, data layout and an
					// AlwaysInline attribute on every function).
					//
					// NOTE(review): `_proc` is not used; the processor is fixed to "gfx908" and
					// the feature string is empty. Confirm with callers before wiring it in.
					//
					// Throws std::runtime_error if the target is unavailable, the object file
					// cannot be created or emitted, or ld.lld reports a failure.
					std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
					  init_llvm();
					  // target description
					  std::string triple = "amdgcn-amd-amdhsa";
					  std::string features;
					  std::string proc = "gfx908";
					  // verify llvm
					  llvm::legacy::PassManager pm;
					  pm.add(llvm::createVerifierPass());
					  pm.run(*module);
					  // create machine
					  module->setTargetTriple(triple);
					  std::string error;
					  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
					  if(!target)
					    throw std::runtime_error("unable to find LLVM target for " + triple + ": " + error);
					  llvm::TargetOptions opt;
					  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
					  opt.UnsafeFPMath = false;
					  opt.NoInfsFPMath = false;
					  opt.NoNaNsFPMath = true;
					  // owned here so the machine is released on every exit path
					  std::unique_ptr<llvm::TargetMachine> machine(
					      target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
					                                  llvm::Reloc::PIC_, llvm::None,
					                                  llvm::CodeGenOpt::Aggressive));
					  if(!machine)
					    throw std::runtime_error("unable to create LLVM target machine for " + proc);
					  // set data layout
					  module->setDataLayout(machine->createDataLayout());
					  // emit machine code
					  for (llvm::Function &f : module->functions())
					    f.addFnAttr(llvm::Attribute::AlwaysInline);
					  // create dump files
					  std::string module_name = module->getModuleIdentifier();
					  // Save GCN ISA binary.
					  std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
					  {
					    std::error_code ec;
					    // the object file is binary, hence OF_None rather than OF_Text
					    llvm::raw_fd_ostream isabin_fs(isabin_path, ec, llvm::sys::fs::OF_None);
					    if (ec)
					      throw std::runtime_error(isabin_path + " was not created. error code: " + ec.message());
					    llvm::legacy::PassManager pass;
					    // addPassesToEmitFile returns true when the file type is not supported
					    if(machine->addPassesToEmitFile(pass, isabin_fs, nullptr, llvm::CGFT_ObjectFile))
					      throw std::runtime_error("LLVM target machine cannot emit AMDGPU object code");
					    pass.run(*module);
					  } // stream is flushed and closed here, before the linker reads the file
					  // Save GCN ISA.
					  // NOTE(review): no assembly is generated in this function, so the .gcn
					  // file is created empty; kept so the same set of files is produced.
					  std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
					  std::ofstream amdgcn(amdgcn_path);
					  amdgcn.close();
					  // generate HSACO file
					  std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
					  std::string error_message;
					  int lld_result =
					      llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
					                                {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
					                                llvm::None, {}, 0, 0, &error_message);
					  if (lld_result)
					    throw std::runtime_error("ld.lld execute fail: " + error_message +
					                             " (exit code " + std::to_string(lld_result) + ")");
					  return hsaco_path;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Reads the file at `path` into memory and loads it as a HIP module through
					// hipModuleLoadDataEx, passing error/info log buffers and the verbose flag.
					//
					// Throws std::runtime_error if the file cannot be opened or fully read.
					hipModule_t amdgpu_to_hipmodule(const std::string& path) {
					  // Read HSACO. Opened at the end so tellg() reports the file size.
					  std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
					  if(!hsaco_file)
					    throw std::runtime_error("unable to open HSACO file: " + path);
					  const std::streamoff hsaco_file_size = hsaco_file.tellg();
					  // tellg() yields -1 on failure, which must not be turned into a vector size
					  if(hsaco_file_size < 0)
					    throw std::runtime_error("unable to determine size of HSACO file: " + path);
					  std::vector<unsigned char> hsaco(static_cast<size_t>(hsaco_file_size));
					  hsaco_file.seekg(0, std::ios::beg);
					  // data() is valid even for an empty vector, unlike &hsaco[0]
					  hsaco_file.read(reinterpret_cast<char*>(hsaco.data()), hsaco_file_size);
					  if(!hsaco_file)
					    throw std::runtime_error("unable to read HSACO file: " + path);
					  hsaco_file.close();
					  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
					                        hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
					                        hipJitOptionLogVerbose};
					  // constant sizes keep the buffers standard C++ arrays rather than VLAs
					  const unsigned int errbufsize = 8192;
					  const unsigned int logbufsize = 8192;
					  char _err[errbufsize];
					  char _log[logbufsize];
					  void* optval[] = {(void*)(uintptr_t)errbufsize,
					                    (void*)_err, (void*)(uintptr_t)logbufsize,
					                    (void*)_log, (void*)1};
					  hipModule_t ret;
					  dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
					  return ret;
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -1,375 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction,
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge,
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software,
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so,
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
*
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
#include <fstream>
 | 
					 | 
				
			||||||
#include <unistd.h>
 | 
					 | 
				
			||||||
#include <memory>
 | 
					 | 
				
			||||||
#include <regex>
 | 
					 | 
				
			||||||
#include "triton/driver/module.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/error.h"
 | 
					 | 
				
			||||||
#include "triton/tools/sha1.hpp"
 | 
					 | 
				
			||||||
#include "triton/tools/sys/getenv.hpp"
 | 
					 | 
				
			||||||
#include "triton/tools/sys/mkdir.hpp"
 | 
					 | 
				
			||||||
#include "triton/tools/sys/exec.hpp"
 | 
					 | 
				
			||||||
#include "llvm/IR/IRBuilder.h"
 | 
					 | 
				
			||||||
#include "llvm/IR/Verifier.h"
 | 
					 | 
				
			||||||
#include "llvm/IR/IRPrintingPasses.h"
 | 
					 | 
				
			||||||
#include "llvm/IR/Module.h"
 | 
					 | 
				
			||||||
#include "llvm/Support/CodeGen.h"
 | 
					 | 
				
			||||||
#include "llvm/Support/CommandLine.h"
 | 
					 | 
				
			||||||
#include "llvm/Support/SourceMgr.h"
 | 
					 | 
				
			||||||
#include "llvm/Support/raw_ostream.h"
 | 
					 | 
				
			||||||
#include "llvm/Support/TargetRegistry.h"
 | 
					 | 
				
			||||||
#include "llvm/Support/TargetSelect.h"
 | 
					 | 
				
			||||||
#include "llvm/Target/TargetMachine.h"
 | 
					 | 
				
			||||||
#include "llvm/Target/TargetOptions.h"
 | 
					 | 
				
			||||||
#include "llvm/IR/LegacyPassManager.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 | 
					 | 
				
			||||||
#include "llvm/Transforms/Utils/Cloning.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Runs `cmd` through popen() in read mode and returns everything read from
// the pipe. The pipe is closed with pclose() when the function returns.
// Throws std::runtime_error if popen() fails.
std::string exec(const char* cmd) {
    std::unique_ptr<FILE, decltype(&pclose)> stream(popen(cmd, "r"), pclose);
    if (stream == nullptr)
        throw std::runtime_error("popen() failed!");

    std::string output;
    char chunk[128];
    for (;;) {
        // fgets yields nullptr at end of stream or on a read error
        if (fgets(chunk, sizeof(chunk), stream.get()) == nullptr)
            break;
        output.append(chunk);
    }
    return output;
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  void LLVMInitializeNVPTXTargetInfo();
 | 
					 | 
				
			||||||
  void LLVMInitializeNVPTXTarget();
 | 
					 | 
				
			||||||
  void LLVMInitializeNVPTXTargetMC();
 | 
					 | 
				
			||||||
  void LLVMInitializeNVPTXAsmPrinter();
 | 
					 | 
				
			||||||
  void LLVMInitializeNVPTXAsmParser();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         Base             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void module::init_llvm() {
 | 
					 | 
				
			||||||
  static bool init = false;
 | 
					 | 
				
			||||||
  if(!init){
 | 
					 | 
				
			||||||
    LLVMInitializeNVPTXTargetInfo();
 | 
					 | 
				
			||||||
    LLVMInitializeNVPTXTarget();
 | 
					 | 
				
			||||||
    LLVMInitializeNVPTXTargetMC();
 | 
					 | 
				
			||||||
    LLVMInitializeNVPTXAsmPrinter();
 | 
					 | 
				
			||||||
    init = true;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// CUDA overload: forwards `mod` and `has_ownership` to the
// polymorphic_resource base and starts with spilled_ set to 0.
module::module(CUmodule mod, bool has_ownership)
  : polymorphic_resource(mod, has_ownership),
    spilled_(0)
{ }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host overload: forwards `mod` and `has_ownership` to the
// polymorphic_resource base and starts with spilled_ set to 0.
module::module(host_module_t mod, bool has_ownership)
  : polymorphic_resource(mod, has_ownership),
    spilled_(0)
{ }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Factory: allocates the module subclass that matches the backend reported by
// `device`, handing over ownership of `src`. The returned object is created
// with `new`. Throws std::runtime_error for any backend other than CUDA or Host.
module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
  const auto which = device->backend();
  if(which == CUDA)
    return new cu_module(device, std::move(src));
  if(which == Host)
    return new host_module(std::move(src));
  throw std::runtime_error("unknown backend");
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Currently a no-op: the body contains no statements and none of the
// parameters are read. `module` is taken by value, so the LLVM module passed
// in is destroyed when the call returns.
void module::compile_llvm_module(
    std::unique_ptr<llvm::Module> module,
    const std::string& triple,
    const std::string &proc,
    std::string layout,
    llvm::SmallVectorImpl<char> &buffer,
    const std::string& features,
    file_type_t ft)
{
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//        Host              //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Host (CPU) modules are not supported: construction always throws
// std::runtime_error("CPU unsupported") right after the base class has been
// initialised, so `src` is never consumed.
//
// The code that used to follow the throw was entirely commented out and
// unreachable (it built a `_main(args, i, j, k)` wrapper around the first
// function of `src` and handed the module to an LLVM JIT); it has been
// removed. Recover it from version control if CPU support is revived.
host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
  throw std::runtime_error("CPU unsupported");
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Symbol lookup is not implemented for host modules: every call throws
// std::runtime_error("not implemented"), regardless of `name`.
std::unique_ptr<buffer> host_module::symbol(const char *name) const {
  (void)name;
  throw std::runtime_error("not implemented");
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         CUDA             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
// Replaces one span of `str` with `target`.
//
// The span starts at the first occurrence of `begin` and extends through the
// first character of the next occurrence of `end` (callers pass the
// single-character delimiter "\n"). If `end` does not occur after `begin`,
// the span extends to the end of the string.
//
// Returns true when a replacement was made, false (with `str` untouched) when
// `begin` is not present.
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
  const size_t start_replace = str.find(begin);
  if(start_replace == std::string::npos)
    return false;
  const size_t end_replace = str.find(end, start_replace);
  // Handle a missing terminator explicitly. The previous arithmetic
  // (npos + 1 - start) relied on unsigned wraparound and evaluated to 0 when
  // `begin` matched at index 0, which inserted `target` without removing
  // anything -- an endless loop for callers that iterate until false.
  const size_t count = (end_replace == std::string::npos)
                           ? std::string::npos
                           : end_replace + 1 - start_replace;
  str.replace(start_replace, count, target);
  return true;
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
//static std::map<int, int> vptx = {
 | 
					 | 
				
			||||||
//  {10000, 63},
 | 
					 | 
				
			||||||
//  {10010, 64},
 | 
					 | 
				
			||||||
//  {10020, 65},
 | 
					 | 
				
			||||||
//  {11000, 70},
 | 
					 | 
				
			||||||
//  {11010, 71},
 | 
					 | 
				
			||||||
//  {11020, 72},
 | 
					 | 
				
			||||||
//  {11030, 73},
 | 
					 | 
				
			||||||
//  {11040, 73}
 | 
					 | 
				
			||||||
//};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Maps a CUDA driver version number (e.g. 11020) to a two-digit PTX version
// code (e.g. 72, which callers split into major 7 / minor 2).
// The highest table entry whose threshold does not exceed `version` wins.
// Throws std::runtime_error for versions below 10000.
int vptx(int version){
  static const struct { int min_version; int ptx; } table[] = {
    {11030, 73},
    {11020, 72},
    {11010, 71},
    {11000, 70},
    {10020, 65},
    {10010, 64},
    {10000, 63},
  };
  // entries are ordered from newest to oldest, so the first hit is the best
  for(const auto& entry : table)
    if(version >= entry.min_version)
      return entry.ptx;
  throw std::runtime_error("Triton requires CUDA 10+");
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Lowers `module` to PTX assembly text for the CUDA device `device`.
//
// Code generation is clamped to sm_75 / PTX 6.4 (the limits hard-coded below
// for the LLVM in use); the emitted `.version` and `.target` directives are
// then rewritten to the PTX version derived from the installed driver and to
// the device's real compute capability.
//
// `device` is cast to driver::cu_device without a check. Side effects: sets
// the target triple and data layout on `module`, marks every function
// AlwaysInline, and turns on the global `nvptx-short-ptr` LLVM option.
//
// Throws std::runtime_error if the LLVM option or target is unavailable or if
// the target cannot emit assembly; vptx() throws for pre-CUDA-10 drivers.
std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) {
  // LLVM version in use may not officially support target hardware
  const int max_nvvm_cc = 75;
  const int max_nvvm_ptx = 64;
  // options
  auto options = llvm::cl::getRegisteredOptions();
  auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
  // checked at run time: an assert would vanish in release builds and leave a
  // null dereference behind
  if(!short_ptr)
    throw std::runtime_error("LLVM option nvptx-short-ptr is not registered");
  short_ptr->setValue(true);
  // compute capability
  int cc = ((driver::cu_device*)device)->compute_capability();
  std::string sm = "sm_" + std::to_string(cc);
  // driver version
  int version;
  dispatch::cuDriverGetVersion(&version);
  int ptx = vptx(version);
  int ptx_major = ptx / 10;
  int ptx_minor = ptx % 10;
  // create
  llvm::SmallVector<char, 0> buffer;
  std::string triple = "nvptx64-nvidia-cuda";
  std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
  std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
  init_llvm();
  // verify and store llvm
  llvm::legacy::PassManager pm;
  pm.add(llvm::createVerifierPass());
  pm.run(*module);
  // create machine
  module->setTargetTriple(triple);
  std::string error;
  const llvm::Target* target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
  if(!target)
    throw std::runtime_error("could not find LLVM target for " + triple + ": " + error);
  llvm::TargetOptions opt;
  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
  opt.UnsafeFPMath = false;
  opt.NoInfsFPMath = false;
  opt.NoNaNsFPMath = true;
  // owned here so the machine is released on every exit path
  std::unique_ptr<llvm::TargetMachine> machine(
      target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
                                  llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive));
  if(!machine)
    throw std::runtime_error("could not create LLVM target machine for " + triple);
  // set data layout
  module->setDataLayout(machine->createDataLayout());
  // emit machine code
  for (llvm::Function &f : module->functions())
    f.addFnAttr(llvm::Attribute::AlwaysInline);
  llvm::legacy::PassManager pass;
  llvm::raw_svector_ostream stream(buffer);
  // emit; addPassesToEmitFile returns true when the file type is unsupported
  if(machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile))
    throw std::runtime_error("LLVM target " + triple + " cannot emit assembly");
  pass.run(*module);

  // post-process
  std::string result(buffer.begin(), buffer.end());
  find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
  // strip the inline-asm marker comments emitted by the backend
  while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
  while(find_and_replace(result, "\t// end inline asm", "\n", ""));
  return result;
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void cu_module::init_from_ptx(const std::string& ptx, driver::cu_device* device) {
 | 
					 | 
				
			||||||
  // JIT compile source-code
 | 
					 | 
				
			||||||
  try{
 | 
					 | 
				
			||||||
    // use ptxas if present in PATH. Otherwise, use JIT from the driver
 | 
					 | 
				
			||||||
    std::string ptxas = "ptxas";
 | 
					 | 
				
			||||||
    std::string version;
 | 
					 | 
				
			||||||
    int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Use PTXAS via system call
 | 
					 | 
				
			||||||
    if(use_system_ptxas){
 | 
					 | 
				
			||||||
      // compile ptx with ptxas
 | 
					 | 
				
			||||||
      char _fsrc[] = "/tmp/triton_k_XXXXXX";
 | 
					 | 
				
			||||||
      char _flog[] = "/tmp/triton_l_XXXXXX";
 | 
					 | 
				
			||||||
      mkstemp(_fsrc);
 | 
					 | 
				
			||||||
      mkstemp(_flog);
 | 
					 | 
				
			||||||
      std::string fsrc = _fsrc;
 | 
					 | 
				
			||||||
      std::string flog = _flog;
 | 
					 | 
				
			||||||
      std::ofstream ofs(fsrc);
 | 
					 | 
				
			||||||
      ofs << ptx;
 | 
					 | 
				
			||||||
      ofs.close();
 | 
					 | 
				
			||||||
      std::string cmd;
 | 
					 | 
				
			||||||
      int err;
 | 
					 | 
				
			||||||
      std::string cc = std::to_string(device->compute_capability());
 | 
					 | 
				
			||||||
      cmd = ptxas + " -v --gpu-name=sm_" + cc + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
 | 
					 | 
				
			||||||
      err = system(cmd.c_str());
 | 
					 | 
				
			||||||
      dispatch::cuModuleLoad(&*cu_, (fsrc + ".o").c_str());
 | 
					 | 
				
			||||||
      unlink(_fsrc);
 | 
					 | 
				
			||||||
      unlink(_flog);
 | 
					 | 
				
			||||||
      return;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // Use PTXAS included in driver
 | 
					 | 
				
			||||||
    CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
 | 
					 | 
				
			||||||
                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
 | 
					 | 
				
			||||||
                          CU_JIT_LOG_VERBOSE};
 | 
					 | 
				
			||||||
    unsigned int errbufsize = 8192;
 | 
					 | 
				
			||||||
    unsigned int logbufsize = 8192;
 | 
					 | 
				
			||||||
    char _err[errbufsize];
 | 
					 | 
				
			||||||
    char _log[logbufsize];
 | 
					 | 
				
			||||||
    void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
 | 
					 | 
				
			||||||
    dispatch::cuModuleLoadDataEx(&*cu_, ptx_.data(), 5, opt, optval);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  catch(exception::cuda::invalid_ptx const &){
 | 
					 | 
				
			||||||
//#ifdef TRITON_LOG_PTX_ERROR
 | 
					 | 
				
			||||||
     std::cout << ptx << std::endl;
 | 
					 | 
				
			||||||
    std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
 | 
					 | 
				
			||||||
//    exit(1);
 | 
					 | 
				
			||||||
//#endif
 | 
					 | 
				
			||||||
    throw;
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Builds a CUDA module from an LLVM module: keeps the textual IR in `llir_`,
// lowers it to PTX (stored in `ptx_`) and loads that PTX for `device`.
// `device` is cast to driver::cu_device without a check.
cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): module(CUmodule(), true) {
  // snapshot the IR before compile_llvm_module mutates the module
  llvm::raw_string_ostream ir_stream(llir_);
  ir_stream << *ll_module;
  ir_stream.flush();

  llvm::Module* raw_module = ll_module.get();
  ptx_ = compile_llvm_module(raw_module, device);

  driver::cu_device* cu_dev = (driver::cu_device*)device;
  init_from_ptx(ptx_, cu_dev);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Builds a CUDA module directly from PTX text: `source` is copied into `ptx_`
// and loaded for `device` (cast to driver::cu_device without a check).
cu_module::cu_module(driver::device* device, std::string const & source)
  : module(CUmodule(), true), ptx_(source){
  driver::cu_device* cu_dev = (driver::cu_device*)device;
  init_from_ptx(ptx_, cu_dev);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
 | 
					 | 
				
			||||||
  CUdeviceptr handle;
 | 
					 | 
				
			||||||
  size_t size;
 | 
					 | 
				
			||||||
  dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
 | 
					 | 
				
			||||||
  std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
 | 
					 | 
				
			||||||
  return std::move(res);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@@ -1,68 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining 
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files 
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction, 
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge, 
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software, 
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so, 
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be 
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <string>
 | 
					 | 
				
			||||||
#include "triton/driver/platform.h"
 | 
					 | 
				
			||||||
#include "triton/driver/device.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         CUDA             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Returns the value reported by cuDriverGetVersion as a decimal string.
std::string cu_platform::version() const{
  int driver_version;
  dispatch::cuDriverGetVersion(&driver_version);
  std::string text = std::to_string(driver_version);
  return text;
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Appends one newly allocated driver::cu_device to `devices` for every device
// ordinal in [0, cuDeviceGetCount). The raw pointers are handed to the
// caller; nothing here frees them.
void cu_platform::devices(std::vector<device *> &devices) const{
  int count;
  dispatch::cuDeviceGetCount(&count);
  int ordinal = 0;
  while(ordinal < count){
    CUdevice handle;
    dispatch::cuDeviceGet(&handle, ordinal);
    devices.push_back(new driver::cu_device(handle));
    ++ordinal;
  }
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//        Host              //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// The host platform reports the fixed version string "1.0".
std::string host_platform::version() const {
  return std::string("1.0");
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Appends a single newly allocated driver::host_device to `devices`.
// The raw pointer is handed to the caller; nothing here frees it.
void host_platform::devices(std::vector<driver::device*> &devices) const {
  driver::host_device* host = new driver::host_device();
  devices.push_back(host);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,142 +0,0 @@
 | 
				
			|||||||
/* Copyright 2015-2017 Philippe Tillet
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* Permission is hereby granted, free of charge, to any person obtaining 
 | 
					 | 
				
			||||||
* a copy of this software and associated documentation files 
 | 
					 | 
				
			||||||
* (the "Software"), to deal in the Software without restriction, 
 | 
					 | 
				
			||||||
* including without limitation the rights to use, copy, modify, merge, 
 | 
					 | 
				
			||||||
* publish, distribute, sublicense, and/or sell copies of the Software, 
 | 
					 | 
				
			||||||
* and to permit persons to whom the Software is furnished to do so, 
 | 
					 | 
				
			||||||
* subject to the following conditions:
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* The above copyright notice and this permission notice shall be 
 | 
					 | 
				
			||||||
* included in all copies or substantial portions of the Software.
 | 
					 | 
				
			||||||
* 
 | 
					 | 
				
			||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 | 
					 | 
				
			||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 | 
					 | 
				
			||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
					 | 
				
			||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
					 | 
				
			||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 | 
					 | 
				
			||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 | 
					 | 
				
			||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
					 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <cassert>
 | 
					 | 
				
			||||||
#include <unistd.h>
 | 
					 | 
				
			||||||
#include <array>
 | 
					 | 
				
			||||||
#include "triton/driver/backend.h"
 | 
					 | 
				
			||||||
#include "triton/driver/stream.h"
 | 
					 | 
				
			||||||
#include "triton/driver/context.h"
 | 
					 | 
				
			||||||
#include "triton/driver/device.h"
 | 
					 | 
				
			||||||
#include "triton/driver/kernel.h"
 | 
					 | 
				
			||||||
#include "triton/driver/buffer.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
 | 
					 | 
				
			||||||
#include "llvm/ExecutionEngine/GenericValue.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace triton
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace driver
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         Base             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Wraps a CUDA stream handle; both arguments are forwarded unchanged to
// polymorphic_resource.
stream::stream(CUstream cu, bool has_ownership): polymorphic_resource(cu, has_ownership) {}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Wraps a host stream handle; both arguments are forwarded unchanged to
// polymorphic_resource.
stream::stream(host_stream_t cl, bool has_ownership): polymorphic_resource(cl, has_ownership) {}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Factory: returns a newly allocated stream for `backend` (cu_stream for
// CUDA, host_stream for Host). The caller receives a raw pointer.
// Throws std::runtime_error("unknown backend") for any other value.
driver::stream* stream::create(backend_t backend) {
  if(backend == CUDA)
    return new cu_stream();
  if(backend == Host)
    return new host_stream();
  throw std::runtime_error("unknown backend");
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//          Host            //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Creates an owning host stream with an empty list of pending futures and a
// ThreadPool constructed with the argument 1.
host_stream::host_stream(): stream(host_stream_t(), true) {
  hst_->futures.reset(new std::vector<std::future<void>>());
  hst_->pool.reset(new ThreadPool(1));
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Blocks until every pending future has completed, then empties both the
// future list and `hst_->args`.
void host_stream::synchronize() {
  auto& pending = *hst_->futures;
  for(auto it = pending.begin(); it != pending.end(); ++it)
    it->wait();
  pending.clear();
  hst_->args.clear();
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Schedules `kernel` once per grid cell on the stream's thread pool.
//
// The `args_size` bytes at `args` are copied up front, so the caller's buffer
// may be reused as soon as this returns. Each task calls the module's host
// entry point with the copied arguments and its (i, j, k) grid index; the
// resulting futures are appended to the stream's pending list. `block` and
// the trailing size_t parameter are not used.
void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t) {
  auto hst = kernel->module()->hst();
  hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
  // The copy is shared by all tasks and released when the last one finishes
  // (or immediately for an empty grid); it used to be allocated with new[]
  // and never freed.
  std::shared_ptr<char> params(new char[args_size], std::default_delete<char[]>());
  std::memcpy((void*)params.get(), (void*)args, args_size);
  auto fn = hst->fn;
  for(size_t i = 0; i < grid[0]; i++)
    for(size_t j = 0; j < grid[1]; j++)
      for(size_t k = 0; k < grid[2]; k++)
        hst_->futures->emplace_back(hst_->pool->enqueue([fn, params, i, j, k]() {
          fn((char**)params.get(), int32_t(i), int32_t(j), int32_t(k));
        }));
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Copies `size` bytes from `ptr` into the host buffer, starting `offset`
// bytes into it. The copy is a plain memcpy, so it is complete on return
// whatever the value of `blocking`.
void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
  // honour `offset` (previously ignored), matching cu_stream::write
  char* dst = static_cast<char*>((void*)buffer->hst()->data) + offset;
  std::memcpy((void*)dst, ptr, size);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Copies `size` bytes out of the host buffer, starting `offset` bytes into
// it, to `ptr`. The copy is a plain memcpy, so it is complete on return
// whatever the value of `blocking`.
void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
  // honour `offset` (previously ignored), matching cu_stream::read
  const char* src = static_cast<const char*>((const void*)buffer->hst()->data) + offset;
  std::memcpy(ptr, (const void*)src, size);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
//         CUDA             //
 | 
					 | 
				
			||||||
/* ------------------------ */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Wraps an existing CUDA stream handle; both arguments are forwarded
// unchanged to the base stream constructor.
cu_stream::cu_stream(CUstream str, bool take_ownership): stream(str, take_ownership) {}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Creates a new owning CUDA stream via cuStreamCreate, passing 0 as the
// flags argument.
cu_stream::cu_stream()
  : stream(CUstream(), true) {
  const unsigned int flags = 0;
  dispatch::cuStreamCreate(&*cu_, flags);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void cu_stream::synchronize() {
 | 
					 | 
				
			||||||
  dispatch::cuStreamSynchronize(*cu_);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Launches `kernel` on this stream with the given grid and block dimensions
// and `shared_mem` bytes of dynamic shared memory. Kernel arguments are
// supplied as one packed buffer (`args`, `args_size` bytes) through the
// launch "extra" parameter list; the per-argument pointer array is null.
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) {
  void *launch_extra[] = {
      CU_LAUNCH_PARAM_BUFFER_POINTER, args,
      CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
      CU_LAUNCH_PARAM_END
  };
  dispatch::cuLaunchKernel(*kernel->cu(),
                           grid[0], grid[1], grid[2],
                           block[0], block[1], block[2],
                           shared_mem, *cu_, nullptr, launch_extra);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Copies `size` bytes from host memory `ptr` to the device buffer at byte
// offset `offset`. With `blocking` set the synchronous cuMemcpyHtoD is used;
// otherwise cuMemcpyHtoDAsync is issued on this stream.
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
  auto dst = *buffer->cu() + offset;
  if(!blocking){
    dispatch::cuMemcpyHtoDAsync(dst, ptr, size, *cu_);
    return;
  }
  dispatch::cuMemcpyHtoD(dst, ptr, size);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Copies `size` bytes from the device buffer at byte offset `offset` to host
// memory `ptr`. With `blocking` set the synchronous cuMemcpyDtoH is used;
// otherwise cuMemcpyDtoHAsync is issued on this stream.
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
  auto src = *buffer->cu() + offset;
  if(!blocking){
    dispatch::cuMemcpyDtoHAsync(ptr, src, size, *cu_);
    return;
  }
  dispatch::cuMemcpyDtoH(ptr, src, size);
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,7 +1,7 @@
 | 
				
			|||||||
#include "triton/codegen/pass.h"
 | 
					#include "triton/codegen/pass.h"
 | 
				
			||||||
#include "triton/driver/kernel.h"
 | 
					#include "triton/codegen/target.h"
 | 
				
			||||||
#include "triton/driver/module.h"
 | 
					#include "triton/driver/error.h"
 | 
				
			||||||
#include "triton/driver/stream.h"
 | 
					#include "triton/driver/llvm.h"
 | 
				
			||||||
#include "triton/ir/builder.h"
 | 
					#include "triton/ir/builder.h"
 | 
				
			||||||
#include "triton/ir/dispatch.h"
 | 
					#include "triton/ir/dispatch.h"
 | 
				
			||||||
#include "triton/ir/enums.h"
 | 
					#include "triton/ir/enums.h"
 | 
				
			||||||
@@ -15,7 +15,9 @@
 | 
				
			|||||||
#include <pybind11/stl.h>
 | 
					#include <pybind11/stl.h>
 | 
				
			||||||
#include <regex>
 | 
					#include <regex>
 | 
				
			||||||
#include <string>
 | 
					#include <string>
 | 
				
			||||||
#include <sstream>
 | 
					#include "llvm/IR/Module.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/LegacyPassManager.h"
 | 
				
			||||||
 | 
					#include "llvm/IR/Verifier.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace py = pybind11;
 | 
					namespace py = pybind11;
 | 
				
			||||||
namespace ir = triton::ir;
 | 
					namespace ir = triton::ir;
 | 
				
			||||||
@@ -24,72 +26,213 @@ namespace drv = triton::driver;
 | 
				
			|||||||
/*****************************************************************************/
 | 
					/*****************************************************************************/
 | 
				
			||||||
/* Python bindings for triton::driver                                        */
 | 
					/* Python bindings for triton::driver                                        */
 | 
				
			||||||
/*****************************************************************************/
 | 
					/*****************************************************************************/
 | 
				
			||||||
 | 
					// information query
 | 
				
			||||||
 | 
					template<CUdevice_attribute attr>
 | 
				
			||||||
 | 
					int cuGetInfo(CUdevice device) {
 | 
				
			||||||
 | 
					  int res;
 | 
				
			||||||
 | 
					  drv::dispatch::cuDeviceGetAttribute(&res, attr, device);
 | 
				
			||||||
 | 
					  return res;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void init_triton_driver(py::module &&m) {
 | 
					template<hipDeviceAttribute_t attr>
 | 
				
			||||||
  // base device
 | 
					int hipGetInfo(hipDevice_t device) {
 | 
				
			||||||
  py::class_<drv::device>(m, "device");
 | 
					  int res;
 | 
				
			||||||
  // cuda device
 | 
					  drv::dispatch::hipDeviceGetAttribute(&res, attr, device);
 | 
				
			||||||
  py::class_<drv::cu_device, drv::device>(m, "cu_device")
 | 
					  return res;
 | 
				
			||||||
      .def(py::init([](int dev_id, bool take_ownership) {
 | 
					}
 | 
				
			||||||
        CUdevice handle;
 | 
					 | 
				
			||||||
        drv::dispatch::cuDeviceGet(&handle, dev_id);
 | 
					 | 
				
			||||||
        return new drv::cu_device(handle, take_ownership);
 | 
					 | 
				
			||||||
      }))
 | 
					 | 
				
			||||||
      .def("max_shared_memory", [](drv::cu_device *self) {
 | 
					 | 
				
			||||||
        return self->max_shared_memory();
 | 
					 | 
				
			||||||
      })
 | 
					 | 
				
			||||||
      .def("enable_peer_access", [](drv::cu_device *self, unsigned long long int peer_mem_ptr) {
 | 
					 | 
				
			||||||
        self->enable_peer_access(peer_mem_ptr);
 | 
					 | 
				
			||||||
      });
 | 
					 | 
				
			||||||
  // host device
 | 
					 | 
				
			||||||
  py::class_<drv::host_device, drv::device>(m, "host_device")
 | 
					 | 
				
			||||||
      .def(py::init<>());
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // base stream
 | 
					enum backend_t {
 | 
				
			||||||
  py::class_<drv::stream>(m, "stream");
 | 
					  HOST,
 | 
				
			||||||
  // host stream
 | 
					  CUDA,
 | 
				
			||||||
  py::class_<drv::host_stream, drv::stream>(m, "host_stream")
 | 
					  ROCM,
 | 
				
			||||||
      .def(py::init<>());
 | 
					};
 | 
				
			||||||
  // cuda stream
 | 
					
 | 
				
			||||||
  py::class_<drv::cu_stream, drv::stream>(m, "cu_stream")
 | 
					void cu_enable_peer_access(uint64_t peer_ptr){
 | 
				
			||||||
      // py doesn't support opaque pointer (e.g., CUstream) so
 | 
					  CUcontext context;
 | 
				
			||||||
      // we assume it has been converted to uint64_t
 | 
					  drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_ptr);
 | 
				
			||||||
      .def(py::init([](uint64_t handle, bool take_ownership) {
 | 
					  try {
 | 
				
			||||||
        return std::unique_ptr<drv::cu_stream>(new drv::cu_stream((CUstream)handle, take_ownership));
 | 
					      drv::dispatch::cuCtxEnablePeerAccess(context, 0);
 | 
				
			||||||
      }))
 | 
					  } catch (drv::exception::cuda::peer_access_already_enabled) {}
 | 
				
			||||||
      .def("enqueue", [](drv::cu_stream *self, drv::kernel *kernel,
 | 
					}
 | 
				
			||||||
                         size_t grid_0, size_t grid_1, size_t grid_2,
 | 
					
 | 
				
			||||||
                         size_t block_0, size_t block_1, size_t block_2,
 | 
					void host_enqueue(uint64_t stream, uint64_t kernel,
 | 
				
			||||||
                         const std::string &args,
 | 
					                  uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
 | 
				
			||||||
                         size_t shared_mem) {
 | 
					                  uint64_t block_0, uint64_t block_1, uint64_t block_2,
 | 
				
			||||||
        return self->enqueue(kernel, {grid_0, grid_1, grid_2}, {block_0, block_1, block_2},
 | 
					                  void* args_ptr, size_t args_size, int64_t shared_mem){
 | 
				
			||||||
                             (void *)args.data(), args.size(), shared_mem);
 | 
					  throw std::runtime_error("unsupported");
 | 
				
			||||||
 | 
					// auto hst = kernel->module()->hst();
 | 
				
			||||||
 | 
					// hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
 | 
				
			||||||
 | 
					// char* params = new char[args_size];
 | 
				
			||||||
 | 
					// std::memcpy((void*)params, (void*)args, args_size);
 | 
				
			||||||
 | 
					// for(size_t i = 0; i < grid[0]; i++)
 | 
				
			||||||
 | 
					//   for(size_t j = 0; j < grid[1]; j++)
 | 
				
			||||||
 | 
					//     for(size_t k = 0; k < grid[2]; k++)
 | 
				
			||||||
 | 
					//       hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void cu_enqueue(uint64_t stream, uint64_t kernel,
 | 
				
			||||||
 | 
					                uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
 | 
				
			||||||
 | 
					                uint64_t block_0, uint64_t block_1, uint64_t block_2,
 | 
				
			||||||
 | 
					                void* args_ptr, size_t args_size, int64_t shared_mem){
 | 
				
			||||||
 | 
					  void *config[] = {
 | 
				
			||||||
 | 
					      CU_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
 | 
				
			||||||
 | 
					      CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
 | 
				
			||||||
 | 
					      CU_LAUNCH_PARAM_END
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					  drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, 
 | 
				
			||||||
 | 
					                                block_0, block_1, block_2, 
 | 
				
			||||||
 | 
					                                shared_mem, (CUstream)stream, nullptr, config);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void hip_enqueue(uint64_t stream, uint64_t kernel,
 | 
				
			||||||
 | 
					                uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
 | 
				
			||||||
 | 
					                uint64_t block_0, uint64_t block_1, uint64_t block_2,
 | 
				
			||||||
 | 
					                void* args_ptr, size_t args_size, int64_t shared_mem) {
 | 
				
			||||||
 | 
					  void *config[] = {
 | 
				
			||||||
 | 
					      HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
 | 
				
			||||||
 | 
					      HIP_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
 | 
				
			||||||
 | 
					      HIP_LAUNCH_PARAM_END
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					  drv::dispatch::hipModuleLaunchKernel((hipFunction_t)kernel, grid_0, grid_1, grid_2, 
 | 
				
			||||||
 | 
					                                block_0, block_1, block_2, 
 | 
				
			||||||
 | 
					                                shared_mem, (hipStream_t)stream, nullptr, config);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void init_triton_runtime(py::module &&m) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // wrap backend_t
 | 
				
			||||||
 | 
					  py::enum_<backend_t>(m, "backend")
 | 
				
			||||||
 | 
					    .value("HOST", HOST)
 | 
				
			||||||
 | 
					    .value("CUDA", CUDA)
 | 
				
			||||||
 | 
					    .value("ROCM", ROCM)
 | 
				
			||||||
 | 
					    .export_values();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // enable peer-to-peer
 | 
				
			||||||
 | 
					  m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) {
 | 
				
			||||||
 | 
					      if (backend != CUDA)
 | 
				
			||||||
 | 
					        throw std::runtime_error("P2P only supported on CUDA devices!");
 | 
				
			||||||
 | 
					      cu_enable_peer_access(peer_ptr);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // query maximum shared memory
 | 
				
			||||||
 | 
					  m.def("max_shared_memory", [](backend_t backend, uint64_t device) {
 | 
				
			||||||
 | 
					      if (backend == HOST)
 | 
				
			||||||
 | 
					        return 0;
 | 
				
			||||||
 | 
					      if(backend == CUDA) 
 | 
				
			||||||
 | 
					        return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(device);
 | 
				
			||||||
 | 
					      if(backend == ROCM)
 | 
				
			||||||
 | 
					        return hipGetInfo<hipDeviceAttributeMaxSharedMemoryPerBlock>(device);
 | 
				
			||||||
 | 
					      return -1;
 | 
				
			||||||
  });
 | 
					  });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  py::class_<drv::module>(m, "module");
 | 
					  // enqueue
 | 
				
			||||||
 | 
					  m.def("enqueue", [](backend_t backend, uint64_t stream, uint64_t kernel,
 | 
				
			||||||
 | 
					                      uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
 | 
				
			||||||
 | 
					                      uint64_t block_0, uint64_t block_1, uint64_t block_2,
 | 
				
			||||||
 | 
					                      const std::string &args, int64_t shared_mem){
 | 
				
			||||||
 | 
					    void* args_ptr = (void*)args.data();
 | 
				
			||||||
 | 
					    size_t args_size = args.size();
 | 
				
			||||||
 | 
					    if(backend == HOST)
 | 
				
			||||||
 | 
					      host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
 | 
				
			||||||
 | 
					    if(backend == CUDA)
 | 
				
			||||||
 | 
					      cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
 | 
				
			||||||
 | 
					    if(backend == ROCM)
 | 
				
			||||||
 | 
					      hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
 | 
				
			||||||
 | 
					  });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  py::class_<drv::cu_module, drv::module>(m, "cu_module")
 | 
					 | 
				
			||||||
      .def("ptx", &drv::cu_module::ptx)
 | 
					 | 
				
			||||||
      .def("cubin", [](drv::cu_module *self) { return py::bytes(self->cubin()); })
 | 
					 | 
				
			||||||
      .def("llir", &drv::cu_module::llir);
 | 
					 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  py::class_<drv::kernel>(m, "kernel");
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*****************************************************************************/
 | 
					/*****************************************************************************/
 | 
				
			||||||
/* Python bindings for triton::codegen                                       */
 | 
					/* Python bindings for triton::codegen                                       */
 | 
				
			||||||
/*****************************************************************************/
 | 
					/*****************************************************************************/
 | 
				
			||||||
 | 
					typedef std::map<std::string, std::string> asm_map_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					std::tuple<uint64_t, uint64_t> cu_compile_llir(const std::string& name, size_t n_shared_bytes, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map, int cc, int version){
 | 
				
			||||||
 | 
					  // LLVM-IR -> PTX
 | 
				
			||||||
 | 
					  std::string ptx = drv::llir_to_ptx(llvm, cc, version);
 | 
				
			||||||
 | 
					  asm_map["ptx"] = ptx;
 | 
				
			||||||
 | 
					  // PTX -> Binary
 | 
				
			||||||
 | 
					  CUmodule mod = drv::ptx_to_cumodule(ptx, cc);
 | 
				
			||||||
 | 
					  // Handle to the kernel
 | 
				
			||||||
 | 
					  CUfunction fun;
 | 
				
			||||||
 | 
					  drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str());
 | 
				
			||||||
 | 
					  // Dynamic shared memory
 | 
				
			||||||
 | 
					  int shared_optin;
 | 
				
			||||||
 | 
					  drv::dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
 | 
				
			||||||
 | 
					  if(n_shared_bytes > 49152 && shared_optin > 49152){
 | 
				
			||||||
 | 
					    drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED);
 | 
				
			||||||
 | 
					    int shared_total, shared_static;
 | 
				
			||||||
 | 
					    int n_spills, n_reg;
 | 
				
			||||||
 | 
					    drv::dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
 | 
				
			||||||
 | 
					    drv::dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
 | 
				
			||||||
 | 
					    drv::dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,  fun);
 | 
				
			||||||
 | 
					    drv::dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, fun);
 | 
				
			||||||
 | 
					    drv::dispatch::cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // record asm
 | 
				
			||||||
 | 
					  return std::make_tuple((uint64_t)mod, (uint64_t)fun);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					std::tuple<uint64_t, uint64_t> hip_compile_llir(const std::string& name, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map){
 | 
				
			||||||
 | 
					  // LLVM-IR -> HSA-CO
 | 
				
			||||||
 | 
					  std::string path = drv::llir_to_amdgpu(llvm, "gfx908");
 | 
				
			||||||
 | 
					  // HSA-CO -> hipModule
 | 
				
			||||||
 | 
					  hipModule_t mod = drv::amdgpu_to_hipmodule(path);
 | 
				
			||||||
 | 
					  // Handle to the kernel
 | 
				
			||||||
 | 
					  hipFunction_t fun;
 | 
				
			||||||
 | 
					  drv::dispatch::hipModuleGetFunction(&fun, mod, name.c_str());
 | 
				
			||||||
 | 
					  // record asm
 | 
				
			||||||
 | 
					  return std::make_tuple((uint64_t)mod, (uint64_t)fun);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void init_triton_codegen(py::module &&m) {
 | 
					void init_triton_codegen(py::module &&m) {
 | 
				
			||||||
  m.def(
 | 
					  m.def(
 | 
				
			||||||
      "add_passes_to_emit_bin", [](ir::module &ir, drv::device *dev, int num_warps, int num_stages, bool force_nc_cache) {
 | 
					      "compile_ttir", [](backend_t backend, ir::module &ir, uint64_t device, int num_warps, int num_stages, bool force_nc_cache) {
 | 
				
			||||||
        drv::module *mod;
 | 
					        std::string name = ir.get_function_list()[0]->get_name();
 | 
				
			||||||
        drv::kernel *ker;
 | 
					        // record asm as we generate
 | 
				
			||||||
        size_t shared_mem;
 | 
					        asm_map_t asm_map;
 | 
				
			||||||
        triton::codegen::add_passes_to_emit_bin(ir, dev, num_warps, num_stages, force_nc_cache, mod, ker, shared_mem);
 | 
					        std::ostringstream ttir;
 | 
				
			||||||
        std::stringstream ss;
 | 
					        ir::print(ir, ttir);
 | 
				
			||||||
        ir::print(ir, ss);
 | 
					        asm_map["ttir"] = ttir.str();
 | 
				
			||||||
        return std::make_tuple(mod, ker, shared_mem, ss.str());
 | 
					        llvm::LLVMContext ctx;
 | 
				
			||||||
 | 
					        if(backend == CUDA){
 | 
				
			||||||
 | 
					          // device properties
 | 
				
			||||||
 | 
					          CUdevice dev = (CUdevice)device;
 | 
				
			||||||
 | 
					          size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
 | 
				
			||||||
 | 
					          size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
 | 
				
			||||||
 | 
					          size_t cc = major*10 + minor;
 | 
				
			||||||
 | 
					          int version;
 | 
				
			||||||
 | 
					          drv::dispatch::cuDriverGetVersion(&version);
 | 
				
			||||||
 | 
					          // Triton-IR -> NVPTX LLVM-IR
 | 
				
			||||||
 | 
					          triton::codegen::nvidia_cu_target target(cc);
 | 
				
			||||||
 | 
					          int n_shared_bytes;
 | 
				
			||||||
 | 
					          auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, cc, num_warps, num_stages, force_nc_cache, n_shared_bytes);
 | 
				
			||||||
 | 
					          llvm::raw_string_ostream llir(asm_map["llir"]);
 | 
				
			||||||
 | 
					          llir << *llvm;
 | 
				
			||||||
 | 
					          llir.flush();
 | 
				
			||||||
 | 
					          // LLVM-IR -> Bin
 | 
				
			||||||
 | 
					          uint64_t mod, fun;
 | 
				
			||||||
 | 
					          std::tie(mod, fun) = cu_compile_llir(name, n_shared_bytes, &*llvm, device, asm_map, cc, version);
 | 
				
			||||||
 | 
					          return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        if(backend == ROCM){
 | 
				
			||||||
 | 
					          // Triton-IR -> NVPTX LLVM-IR
 | 
				
			||||||
 | 
					          triton::codegen::amd_cl_target target;
 | 
				
			||||||
 | 
					          int n_shared_bytes;
 | 
				
			||||||
 | 
					          auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, 70, num_warps, num_stages, force_nc_cache, n_shared_bytes);
 | 
				
			||||||
 | 
					          llvm::raw_string_ostream llir(asm_map["llir"]);
 | 
				
			||||||
 | 
					          llir << *llvm;
 | 
				
			||||||
 | 
					          llir.flush();
 | 
				
			||||||
 | 
					          // LLVM-IR -> Bin
 | 
				
			||||||
 | 
					          uint64_t mod, fun;
 | 
				
			||||||
 | 
					          std::tie(mod, fun) = hip_compile_llir(name, &*llvm, device, asm_map);
 | 
				
			||||||
 | 
					          return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
      },
 | 
					      },
 | 
				
			||||||
      py::return_value_policy::take_ownership);
 | 
					      py::return_value_policy::take_ownership);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -302,7 +445,7 @@ void init_triton_ir(py::module &&m) {
 | 
				
			|||||||
void init_triton(py::module &m) {
 | 
					void init_triton(py::module &m) {
 | 
				
			||||||
  py::module subm = m.def_submodule("triton");
 | 
					  py::module subm = m.def_submodule("triton");
 | 
				
			||||||
  init_triton_codegen(std::move(subm.def_submodule("code_gen")));
 | 
					  init_triton_codegen(std::move(subm.def_submodule("code_gen")));
 | 
				
			||||||
  init_triton_driver(std::move(subm.def_submodule("driver")));
 | 
					  init_triton_runtime(std::move(subm.def_submodule("runtime")));
 | 
				
			||||||
  init_triton_ir(std::move(subm.def_submodule("ir")));
 | 
					  init_triton_ir(std::move(subm.def_submodule("ir")));
 | 
				
			||||||
  init_triton_frontend(std::move(subm.def_submodule("frontend")));
 | 
					  init_triton_frontend(std::move(subm.def_submodule("frontend")));
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -34,6 +34,8 @@ def patch_kernel(template, to_replace):
 | 
				
			|||||||
    return kernel
 | 
					    return kernel
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# generic test functions
 | 
					# generic test functions
 | 
				
			||||||
def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
 | 
					def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
 | 
				
			||||||
    SIZE = 128
 | 
					    SIZE = 128
 | 
				
			||||||
@@ -425,7 +427,7 @@ def test_permute(dtype, shape, perm, device='cuda'):
 | 
				
			|||||||
    # compare
 | 
					    # compare
 | 
				
			||||||
    triton.testing.assert_almost_equal(z_tri, z_ref)
 | 
					    triton.testing.assert_almost_equal(z_tri, z_ref)
 | 
				
			||||||
    # parse ptx to make sure ld/st are vectorized
 | 
					    # parse ptx to make sure ld/st are vectorized
 | 
				
			||||||
    ptx = pgm.asm('ptx')
 | 
					    ptx = pgm.asm['ptx']
 | 
				
			||||||
    assert 'ld.global.v4' in ptx
 | 
					    assert 'ld.global.v4' in ptx
 | 
				
			||||||
    assert 'st.global.v4' in ptx
 | 
					    assert 'st.global.v4' in ptx
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -484,7 +486,7 @@ def test_dot(epilogue, device='cuda'):
 | 
				
			|||||||
        z_ref += z[0,:][None, :]
 | 
					        z_ref += z[0,:][None, :]
 | 
				
			||||||
    z_ref = z_ref.to(torch.float16)
 | 
					    z_ref = z_ref.to(torch.float16)
 | 
				
			||||||
    # compare
 | 
					    # compare
 | 
				
			||||||
    ptx = pgm.asm('ptx')
 | 
					    ptx = pgm.asm['ptx']
 | 
				
			||||||
    # print(ptx)
 | 
					    # print(ptx)
 | 
				
			||||||
    triton.testing.assert_almost_equal(z_tri, z_ref)
 | 
					    triton.testing.assert_almost_equal(z_tri, z_ref)
 | 
				
			||||||
    # make sure ld/st are vectorized
 | 
					    # make sure ld/st are vectorized
 | 
				
			||||||
@@ -511,3 +513,13 @@ def test_dot(epilogue, device='cuda'):
 | 
				
			|||||||
# ---------------
 | 
					# ---------------
 | 
				
			||||||
# test while
 | 
					# test while
 | 
				
			||||||
# ---------------
 | 
					# ---------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ---------------
 | 
				
			||||||
 | 
					# test noop
 | 
				
			||||||
 | 
					#----------------
 | 
				
			||||||
 | 
					def test_noop(device='cuda'):
 | 
				
			||||||
 | 
					    @triton.jit
 | 
				
			||||||
 | 
					    def kernel(**meta):
 | 
				
			||||||
 | 
					        pass
 | 
				
			||||||
 | 
					    x = triton.testing.random((1,), dtype=torch.int32, device=device)
 | 
				
			||||||
 | 
					    kernel[(1, )](x)
 | 
				
			||||||
@@ -411,9 +411,9 @@ class CodeGenerator(ast.NodeVisitor):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Binary:
 | 
					class Binary:
 | 
				
			||||||
    def __init__(self, module, kernel, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm):
 | 
					    def __init__(self, backend, module, kernel, asm, num_warps, num_stages, force_nc_cache, shared_mem):
 | 
				
			||||||
        # cache ir asm
 | 
					        # cache ir asm
 | 
				
			||||||
        self.ir_asm = ir_asm
 | 
					        self.asm = asm
 | 
				
			||||||
        self.module = module
 | 
					        self.module = module
 | 
				
			||||||
        self.kernel = kernel
 | 
					        self.kernel = kernel
 | 
				
			||||||
        self.shared_mem = shared_mem
 | 
					        self.shared_mem = shared_mem
 | 
				
			||||||
@@ -421,29 +421,13 @@ class Binary:
 | 
				
			|||||||
        self.num_stages = num_stages
 | 
					        self.num_stages = num_stages
 | 
				
			||||||
        self.force_nc_cache = force_nc_cache
 | 
					        self.force_nc_cache = force_nc_cache
 | 
				
			||||||
        self.sass = None
 | 
					        self.sass = None
 | 
				
			||||||
 | 
					        self.backend = backend
 | 
				
			||||||
    def asm(self, mode):
 | 
					 | 
				
			||||||
        if mode == 'ttir':
 | 
					 | 
				
			||||||
            return self.ir_asm
 | 
					 | 
				
			||||||
        if mode == 'ptx':
 | 
					 | 
				
			||||||
            return self.module.ptx()
 | 
					 | 
				
			||||||
        if mode == 'sass':
 | 
					 | 
				
			||||||
            if self.sass is None:
 | 
					 | 
				
			||||||
                cubin = self.module.cubin()
 | 
					 | 
				
			||||||
                # get a temporary file name
 | 
					 | 
				
			||||||
                fd, path = tempfile.mkstemp(suffix='.cubin')
 | 
					 | 
				
			||||||
                f = open(path, 'wb')
 | 
					 | 
				
			||||||
                f.write(cubin)
 | 
					 | 
				
			||||||
                f.close()
 | 
					 | 
				
			||||||
                # extract SASS from cubin
 | 
					 | 
				
			||||||
                self.sass = extract(path, None)
 | 
					 | 
				
			||||||
            return self.sass
 | 
					 | 
				
			||||||
        if mode == 'llir':
 | 
					 | 
				
			||||||
            return self.module.llir()
 | 
					 | 
				
			||||||
        raise ValueError('Unsupported mode ' + mode)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, stream, args, grid_0, grid_1=1, grid_2=1):
 | 
					    def __call__(self, stream, args, grid_0, grid_1=1, grid_2=1):
 | 
				
			||||||
        stream.enqueue(self.kernel, grid_0, grid_1, grid_2, self.num_warps * 32, 1, 1, args, self.shared_mem)
 | 
					        _triton.runtime.enqueue(self.backend, stream, self.kernel,
 | 
				
			||||||
 | 
					                                grid_0, grid_1, grid_2, 
 | 
				
			||||||
 | 
					                                self.num_warps * 32, 1, 1, 
 | 
				
			||||||
 | 
					                                args, self.shared_mem)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class CompilationError(Exception):
 | 
					class CompilationError(Exception):
 | 
				
			||||||
@@ -548,10 +532,15 @@ class Kernel:
 | 
				
			|||||||
                raise e
 | 
					                raise e
 | 
				
			||||||
            raise CompilationError(self.fn.src, node, e)
 | 
					            raise CompilationError(self.fn.src, node, e)
 | 
				
			||||||
        # Compile to machine code
 | 
					        # Compile to machine code
 | 
				
			||||||
        mod, ker, shared_mem, ir_asm = _triton.code_gen.add_passes_to_emit_bin(generator.module, device, num_warps, num_stages, force_nc_cache)
 | 
					        if torch.version.hip is None:
 | 
				
			||||||
        if shared_mem > device.max_shared_memory():
 | 
					            backend = _triton.runtime.backend.CUDA
 | 
				
			||||||
            raise OutOfResources(shared_mem, device.max_shared_memory(), "shared memory")
 | 
					        else:
 | 
				
			||||||
        return Binary(mod, ker, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm)
 | 
					            backend = _triton.runtime.backend.ROCM
 | 
				
			||||||
 | 
					        mod, ker, asm, shared_mem = _triton.code_gen.compile_ttir(backend, generator.module, device, num_warps, num_stages, force_nc_cache)
 | 
				
			||||||
 | 
					        max_shared_memory = _triton.runtime.max_shared_memory(backend, device)
 | 
				
			||||||
 | 
					        if shared_mem > max_shared_memory:
 | 
				
			||||||
 | 
					            raise OutOfResources(shared_mem, max_shared_memory, "shared memory")
 | 
				
			||||||
 | 
					        return Binary(backend, mod, ker, asm, num_warps, num_stages, force_nc_cache, shared_mem)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, *wargs, grid, num_warps=4, num_stages=2, force_nc_cache=False, **meta):
 | 
					    def __call__(self, *wargs, grid, num_warps=4, num_stages=2, force_nc_cache=False, **meta):
 | 
				
			||||||
        # device inference
 | 
					        # device inference
 | 
				
			||||||
@@ -571,19 +560,20 @@ class Kernel:
 | 
				
			|||||||
                             " Only CUDA is supported at the moment")
 | 
					                             " Only CUDA is supported at the moment")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        device = torch.device('cuda', torch.cuda.current_device())
 | 
					        device = torch.device('cuda', torch.cuda.current_device())
 | 
				
			||||||
        tt_device = _triton.driver.cu_device(device.index, False)
 | 
					        device_ty  = device.type
 | 
				
			||||||
        if len(set(device_ids)) != 1 or device_ids[0] != device.index:
 | 
					        device_idx = device.index
 | 
				
			||||||
 | 
					        if len(set(device_ids)) != 1 or device_ids[0] != device_idx:
 | 
				
			||||||
            # try to enable P2P communication
 | 
					            # try to enable P2P communication
 | 
				
			||||||
            for arg_idx, dst_idx in zip(tensor_idxs, device_ids):
 | 
					            for arg_idx, dst_idx in zip(tensor_idxs, device_ids):
 | 
				
			||||||
                if dst_idx != device.index:
 | 
					                if dst_idx != device_idx:
 | 
				
			||||||
                    try:
 | 
					                    try:
 | 
				
			||||||
                        tt_device.enable_peer_access(wargs[arg_idx].data_ptr())
 | 
					                        _triton.runtime.enable_peer_access(self.backend, wargs[arg_idx].data_ptr())
 | 
				
			||||||
                    except RuntimeError as e:
 | 
					                    except RuntimeError as e:
 | 
				
			||||||
                        raise RuntimeError("Cannot enable P2P access from device {} to device {}: {}"
 | 
					                        raise RuntimeError("Cannot enable P2P access from device {} to device {}: {}"
 | 
				
			||||||
                                           .format(device.index, dst_idx, str(e)))
 | 
					                                           .format(device_idx, dst_idx, str(e)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # enqueue kernel on the current device
 | 
					        # enqueue kernel on the current device
 | 
				
			||||||
        torch.cuda.set_device(device.index)
 | 
					        torch.cuda.set_device(device_idx)
 | 
				
			||||||
        # attributes
 | 
					        # attributes
 | 
				
			||||||
        args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)]
 | 
					        args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)]
 | 
				
			||||||
        attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args) if isinstance(a, int)}
 | 
					        attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args) if isinstance(a, int)}
 | 
				
			||||||
@@ -594,12 +584,12 @@ class Kernel:
 | 
				
			|||||||
        attr_key = frozenset(attributes.items())
 | 
					        attr_key = frozenset(attributes.items())
 | 
				
			||||||
        meta_key = frozenset(meta.items())
 | 
					        meta_key = frozenset(meta.items())
 | 
				
			||||||
        const_key = frozenset(constants.items())
 | 
					        const_key = frozenset(constants.items())
 | 
				
			||||||
        key = (device.type, device.index, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
 | 
					        key = (device_ty, device_idx, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
 | 
				
			||||||
        cache = self.fn.cache
 | 
					        cache = self.fn.cache
 | 
				
			||||||
        if key not in cache:
 | 
					        if key not in cache:
 | 
				
			||||||
            # compile and cache configuration if necessary
 | 
					            # compile and cache configuration if necessary
 | 
				
			||||||
            cache[key] = self._compile(
 | 
					            cache[key] = self._compile(
 | 
				
			||||||
                *wargs, device=tt_device, attributes=attributes,
 | 
					                *wargs, device=device_idx, attributes=attributes,
 | 
				
			||||||
                num_warps=num_warps, num_stages=num_stages, force_nc_cache=force_nc_cache, 
 | 
					                num_warps=num_warps, num_stages=num_stages, force_nc_cache=force_nc_cache, 
 | 
				
			||||||
                constants=constants, **meta
 | 
					                constants=constants, **meta
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
@@ -608,8 +598,7 @@ class Kernel:
 | 
				
			|||||||
        params = struct.pack(fmt, *args)
 | 
					        params = struct.pack(fmt, *args)
 | 
				
			||||||
        # enqueue cached function into stream
 | 
					        # enqueue cached function into stream
 | 
				
			||||||
        binary = cache[key]
 | 
					        binary = cache[key]
 | 
				
			||||||
        cu_stream = torch.cuda.current_stream(device.index).cuda_stream
 | 
					        stream = torch.cuda.current_stream(device_idx).cuda_stream
 | 
				
			||||||
        stream = _triton.driver.cu_stream(cu_stream, False)
 | 
					 | 
				
			||||||
        grid = grid(meta) if hasattr(grid, '__call__') else grid
 | 
					        grid = grid(meta) if hasattr(grid, '__call__') else grid
 | 
				
			||||||
        binary(stream, params, *grid)
 | 
					        binary(stream, params, *grid)
 | 
				
			||||||
        return binary
 | 
					        return binary
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -64,7 +64,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
 | 
				
			|||||||
    #  - each torch.tensor object is implicitly converted into a pointer to its first element.
 | 
					    #  - each torch.tensor object is implicitly converted into a pointer to its first element.
 | 
				
			||||||
    #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
 | 
					    #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
 | 
				
			||||||
    #  - don't forget to pass meta-parameters as keywords arguments
 | 
					    #  - don't forget to pass meta-parameters as keywords arguments
 | 
				
			||||||
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
 | 
					    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
 | 
				
			||||||
    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
 | 
					    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
 | 
				
			||||||
    # running asynchronously at this point.
 | 
					    # running asynchronously at this point.
 | 
				
			||||||
    return output
 | 
					    return output
 | 
				
			||||||
@@ -85,6 +85,7 @@ print(
 | 
				
			|||||||
    f'The maximum difference between torch and triton is '
 | 
					    f'The maximum difference between torch and triton is '
 | 
				
			||||||
    f'{torch.max(torch.abs(output_torch - output_triton))}'
 | 
					    f'{torch.max(torch.abs(output_torch - output_triton))}'
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					exit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# %%
 | 
					# %%
 | 
				
			||||||
# Seems like we're good to go!
 | 
					# Seems like we're good to go!
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user