C++: Added clBLAS sGEMM ABI (still buggy)
@@ -11,12 +11,12 @@ if(CUDA_FOUND)
endif()

#CLAMDBLAS
find_package(CLAMDBLAS)
if(CLAMDBLAS_FOUND)
#find_package(CLAMDBLAS)
#if(CLAMDBLAS_FOUND)
set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CLBLAS")
include_directories(${CLAMDBLAS_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${CLAMDBLAS_LIBRARIES} )
endif()
#include_directories(${CLAMDBLAS_INCLUDE_DIR})
#set(BLAS_LIBS ${BLAS_LIBS} ${CLAMDBLAS_LIBRARIES} )
#endif()

#CBLAS
find_package(MKL)

bench/blas.cpp | 240
@@ -2,7 +2,7 @@
#include "isaac/symbolic/execute.h"
#include "isaac/tools/timer.hpp"
#ifdef BENCH_CLBLAS
#include "clBLAS.h"
#include "isaac/wrap/clBLAS.h"
#endif
#ifdef BENCH_CBLAS
#include "cblas.h"
@@ -166,127 +166,127 @@ void bench(ad::numeric_type dtype, std::string operation)
// /*--BLAS1--*/
// /*---------*/

if(operation=="axpy")
{
for(int_t N: create_log_range(1e3, 2e7, 50, 64))
{
std::cout << N;
ad::array x(N, dtype), y(N, dtype);
/* ISAAC */
std::list<ad::driver::Event> events;\
BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
/* clblas */
#ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
T *cux, *cuy;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
}
// if(operation=="axpy")
// {
// for(int_t N: create_log_range(1e3, 2e7, 50, 64))
// {
// std::cout << N;
// ad::array x(N, dtype), y(N, dtype);
// /* ISAAC */
// std::list<ad::driver::Event> events;\
// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
// /* clblas */
// #ifdef BENCH_CLBLAS
// BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
// #endif
// /* CuBLAS */
// #ifdef BENCH_CUBLAS
// T *cux, *cuy;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
// #endif
// std::cout << std::endl;
// }
// }

if(operation=="dot")
{
for(int_t N: create_log_range(1e3, 2e7, 50, 64))
{
std::cout << N;
/* ISAAC */
ad::array x(N, dtype), y(N, dtype);
ad::array scratch(N, dtype);
ad::scalar s(dtype);
s = dot(x,y); queue.synchronize();
BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
/* clblas */
#ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
T *cux, *cuy;
T result;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;
}
// if(operation=="dot")
// {
// for(int_t N: create_log_range(1e3, 2e7, 50, 64))
// {
// std::cout << N;
// /* ISAAC */
// ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// s = dot(x,y); queue.synchronize();
// BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
// /* clblas */
// #ifdef BENCH_CLBLAS
// BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
// #endif
// #ifdef BENCH_CUBLAS
// T *cux, *cuy;
// T result;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
// #endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
// }

if(operation.substr(0, 4)=="gemv")
{
std::vector<std::tuple<int_t, int_t> > MNs;
MNs.push_back(std::make_tuple(896,896));
MNs.push_back(std::make_tuple(3072,3072));
MNs.push_back(std::make_tuple(64,32000));
MNs.push_back(std::make_tuple(896,32000));
MNs.push_back(std::make_tuple(32000, 64));
MNs.push_back(std::make_tuple(32000, 896));
// if(operation.substr(0, 4)=="gemv")
// {
// std::vector<std::tuple<int_t, int_t> > MNs;
// MNs.push_back(std::make_tuple(896,896));
// MNs.push_back(std::make_tuple(3072,3072));
// MNs.push_back(std::make_tuple(64,32000));
// MNs.push_back(std::make_tuple(896,32000));
// MNs.push_back(std::make_tuple(32000, 64));
// MNs.push_back(std::make_tuple(32000, 896));

/*---------*/
/*--BLAS2--*/
/*---------*/
//T-layout
for(std::tuple<int_t, int_t> MN: MNs)
{
int_t M = std::get<0>(MN);
int_t N = std::get<1>(MN);
std::cout << M << "," << N;
/* ISAAC */
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
#if HAS_A_BLAS
int_t lda = A.ld();
#endif
y = dot(trans(A),x); queue.synchronize();
BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t);
BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
#endif
#ifdef BENCH_CBLAS
std::vector<float> cA(N*M), cx(N), cy(M);
ad::copy(x, cx);
ad::copy(y, cy);
ad::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
T *cuA, *cux, *cuy;
cudaMalloc((void**) &cuA, N * M * sizeof(T));
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, M * sizeof(T));
BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
cudaFree(cuA);
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;
}
// /*---------*/
// /*--BLAS2--*/
// /*---------*/
// //T-layout
// for(std::tuple<int_t, int_t> MN: MNs)
// {
// int_t M = std::get<0>(MN);
// int_t N = std::get<1>(MN);
// std::cout << M << "," << N;
// /* ISAAC */
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
// #if HAS_A_BLAS
// int_t lda = A.ld();
// #endif
// y = dot(trans(A),x); queue.synchronize();
// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t);
// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
// #ifdef BENCH_CLBLAS
// BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
// #endif
// #ifdef BENCH_CBLAS
// std::vector<float> cA(N*M), cx(N), cy(M);
// ad::copy(x, cx);
// ad::copy(y, cy);
// ad::copy(A, cA);
// BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
// #endif
// #ifdef BENCH_CUBLAS
// T *cuA, *cux, *cuy;
// cudaMalloc((void**) &cuA, N * M * sizeof(T));
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, M * sizeof(T));
// BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
// cudaFree(cuA);
// cudaFree(cux);
// cudaFree(cuy);
// #endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
// }

if(operation.substr(0,4)=="gemm")
{
@@ -312,7 +312,7 @@ void bench(ad::numeric_type dtype, std::string operation)
#if HAS_A_BLAS
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
#endif
BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t);
// BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t);
//BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t);
/* clblas */
#ifdef BENCH_CLBLAS
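The gemm hunk shown here stops right at the BENCH_CLBLAS guard. For orientation only, a hypothetical benchmark call in the same style as the clblasSgemv line above and matching the clblasSgemm signature added in lib/wrap/clBLAS.cpp further down; BENCHMARK_CLBLAS, CL_HANDLE, queue and event are helpers already used in this file, and this sketch is not the committed line:

// Illustrative only: C = A * B^T in single precision through the clBLAS-compatible entry point.
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K,
                             1, CL_HANDLE(A.data()), 0, lda,
                             CL_HANDLE(B.data()), 0, ldb,
                             0, CL_HANDLE(C.data()), 0, ldc,
                             1, &CL_HANDLE(queue), 0, NULL, &event()),
                 (double)2*M*N*K/t)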
@@ -17,14 +17,12 @@ class array: public array_base
public:
//1D Constructors
array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context());
array(int_t size1, numeric_type dtype, driver::Buffer data);
template<typename DT>
array(std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
array(array & v, slice const & s1);

//2D Constructors
array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context());
// array(int_t size1, int_t size2, numeric_type dtype, cl_mem data);
template<typename DT>
array(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
array(array & M, slice const & s1, slice const & s2);
@@ -33,7 +31,7 @@ public:
array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context());

//General constructor
array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld, driver::Context context = driver::queues.default_context());
array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld);

array(array_expression const & proxy);
array(array const &);
@@ -105,7 +103,7 @@ private:
void inject(values_holder&) const;
template<class T> T cast() const;
public:
explicit scalar(numeric_type dtype, driver::Buffer const & data, int_t offset, driver::Context context = driver::queues.default_context());
explicit scalar(numeric_type dtype, driver::Buffer const & data, int_t offset);
explicit scalar(value_scalar value, driver::Context context = driver::queues.default_context());
explicit scalar(numeric_type dtype, driver::Context context = driver::queues.default_context());
scalar(array_expression const & proxy);
@@ -14,8 +14,11 @@ class Event
{
friend class CommandQueue;
public:
Event(cl::Event const & event);
Event(backend_type backend);
long elapsed_time() const;
operator cl::Event();

private:
backend_type backend_;
#ifdef ISAAC_WITH_CUDA
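The two members added here, the Event(cl::Event const &) constructor and operator cl::Event(), let OpenCL event handles round-trip through the driver layer; the clBLAS wrapper below relies on this to hand a cl_event back to its caller. A minimal sketch of that round trip, assuming the header path isaac/driver/event.h and a cl_event already obtained from a profiling-enabled queue:

#include "CL/cl.hpp"
#include "isaac/driver/event.h"   // assumed location of the Event class shown above

// `raw` would come from an OpenCL enqueue on a profiling-enabled queue.
void roundtrip(cl_event raw)
{
    isaac::driver::Event e{cl::Event(raw)};        // wrap via the new converting constructor
    long ns = e.elapsed_time();                    // profiling delta declared above
    cl_event back = static_cast<cl::Event>(e)();   // unwrap: operator cl::Event(), then cl::Event::operator()
    (void)ns; (void)back;
}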
@@ -220,7 +220,12 @@ private:
struct execution_options_type
{
execution_options_type(unsigned int _queue_id = 0, std::list<driver::Event>* _events = NULL, std::vector<driver::Event>* _dependencies = NULL) :
events(_events), dependencies(_dependencies), queue_id_(_queue_id){}
events(_events), dependencies(_dependencies), queue_id_(_queue_id)
{}

execution_options_type(driver::CommandQueue const & queue, std::list<driver::Event> *_events = NULL, std::vector<driver::Event> *_dependencies = NULL) :
events(_events), dependencies(_dependencies), queue_(new driver::CommandQueue(queue))
{}

void enqueue(driver::Context const & context, driver::Kernel const & kernel, driver::NDRange global, driver::NDRange local) const
{
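The new overload taking a driver::CommandQueue is what lets the clBLAS wrapper run on a queue supplied by the caller rather than an internal queue id. A minimal usage sketch under the same assumptions as the wrapper code further down (existing isaac arrays A, B, C and a caller-owned cl_command_queue q; the includes mirror lib/wrap/clBLAS.cpp):

#include <list>
#include <vector>
#include "CL/cl.hpp"
#include "isaac/array.h"

namespace is = isaac;

// Sketch: enqueue C = dot(A, B) on a caller-supplied queue and collect the produced events.
void run_on(cl_command_queue q, is::array& A, is::array& B, is::array& C)
{
    clRetainCommandQueue(q);                        // cl.hpp's wrapper releases on destruction, mirroring the wrapper below
    std::list<is::driver::Event> events;            // filled by the enqueue
    std::vector<is::driver::Event> dependencies;    // events to wait on (empty here)
    is::driver::CommandQueue queue{cl::CommandQueue(q)};
    is::execution_options_type opt(queue, &events, &dependencies);
    C = is::control(dot(A, B), opt);
}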

include/isaac/wrap/clBLAS-complex.h | 53 (new file)
@@ -0,0 +1,53 @@
/* ************************************************************************
* Copyright 2013 Advanced Micro Devices, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************/


#ifndef CLBLAS_COMPLEX_H_
#define CLBLAS_COMPLEX_H_

#ifdef __cplusplus
extern "C" {
#endif

typedef cl_float2 FloatComplex;
typedef cl_double2 DoubleComplex;

static __inline FloatComplex
floatComplex(float real, float imag)
{
FloatComplex z;
z.s[0] = real;
z.s[1] = imag;
return z;
}

static __inline DoubleComplex
doubleComplex(double real, double imag)
{
DoubleComplex z;
z.s[0] = real;
z.s[1] = imag;
return z;
}

#define CREAL(v) ((v).s[0])
#define CIMAG(v) ((v).s[1])

#ifdef __cplusplus
} /* extern "C" { */
#endif

#endif /* CLBLAS_COMPLEX_H_ */
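A small usage sketch for the helpers declared in this header; it only assumes an OpenCL SDK on the include path (CL/cl.h supplies the cl_float2 type the header builds on):

#include <cstdio>
#include <CL/cl.h>                        // cl_float2 / cl_double2 used by the header
#include "isaac/wrap/clBLAS-complex.h"

int main()
{
    FloatComplex z = floatComplex(1.0f, -2.0f);       // pack real/imaginary parts
    std::printf("re=%f im=%f\n", CREAL(z), CIMAG(z));
    return 0;
}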

include/isaac/wrap/clBLAS.h | 9654 (new file)
File diff suppressed because it is too large
@@ -19,11 +19,6 @@ array::array(int_t shape0, numeric_type dtype, driver::Context context) :
context_(context), data_(context_, size_of(dtype)*dsize())
{ }

array::array(int_t shape0, numeric_type dtype, driver::Buffer data) :
dtype_(dtype), shape_(shape0, 1, 1, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
context_(data.context()), data_(data)
{ }

template<class DT>
array::array(std::vector<DT> const & x, driver::Context context):
dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
@@ -90,9 +85,9 @@ INSTANTIATE(double);
#undef INSTANTIATE

// General
array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld, driver::Context context):
array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld):
dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride),
ld_(ld), context_(context), data_(data)
ld_(ld), context_(data.context()), data_(data)
{ }

array::array(array_expression const & proxy) : array(control(proxy)){}
@@ -236,13 +231,13 @@ array_expression array::T() const
scalar array::operator [](int_t idx)
{
assert(nshape()==1);
return scalar(dtype_, data_, idx, context_);
return scalar(dtype_, data_, idx);
}

const scalar array::operator [](int_t idx) const
{
assert(nshape()==1);
return scalar(dtype_, data_, idx, context_);
return scalar(dtype_, data_, idx);
}


@@ -268,7 +263,7 @@ void copy(driver::Context & ctx, driver::Buffer const & data, T value)

}

scalar::scalar(numeric_type dtype, const driver::Buffer &data, int_t offset, driver::Context context): array(dtype, data, _(offset, offset+1), _(1,2), 1, context)
scalar::scalar(numeric_type dtype, const driver::Buffer &data, int_t offset): array(dtype, data, _(offset, offset+1), _(1,2), 1)
{ }

scalar::scalar(value_scalar value, driver::Context context) : array(1, value.dtype(), context)
@@ -21,6 +21,11 @@ Event::Event(backend_type backend) : backend_(backend), h_(backend_)
}
}

Event::Event(cl::Event const & event) : backend_(OPENCL), h_(backend_)
{
*h_.cl = event;
}

long Event::elapsed_time() const
{
switch(backend_)
@@ -38,6 +43,10 @@ long Event::elapsed_time() const
}
}

Event::operator cl::Event()
{
return *h_.cl;
}
}

}

lib/wrap/clBLAS.cpp | 71 (new file)
@@ -0,0 +1,71 @@
#include "CL/cl.hpp"

#include "isaac/wrap/clBLAS.h"
#include "isaac/array.h"

namespace is = isaac;

extern "C"
{

clblasStatus clblasSetup()
{
return clblasSuccess;
}

void clblasTeardown()
{

}

clblasStatus clblasSgemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,
size_t M, size_t N, size_t K,
cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
const cl_mem mB, size_t offB, size_t ldb, cl_float beta,
cl_mem mC, size_t offC, size_t ldc,
cl_uint numCommandQueues, cl_command_queue *commandQueues,
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
{
is::int_t As1 = M, As2 = K;
is::int_t Bs1 = K, Bs2 = N;

if(transA==clblasTrans) std::swap(As1, As2);
if(transB==clblasTrans) std::swap(Bs1, Bs2);

is::array A(is::FLOAT_TYPE, cl::Buffer(mA), is::_(offA%lda, As1, 1), is::_(offA/lda, As2, 1), lda);
is::array B(is::FLOAT_TYPE, cl::Buffer(mB), is::_(offB%ldb, Bs1, 1), is::_(offB/ldb, Bs2, 1), ldb);
is::array C(is::FLOAT_TYPE, cl::Buffer(mC), is::_(offC%ldc, M, 1), is::_(offC/ldc, N, 1), ldc);

std::vector<is::driver::Event> waitlist;
for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i)
waitlist.push_back(cl::Event(eventWaitList[i]));

std::list<is::driver::Event> levents;

for(cl_uint i = 0 ; i < numCommandQueues ; ++i)
{
clRetainCommandQueue(commandQueues[i]);
cl::CommandQueue clqueue(commandQueues[i]);
is::driver::CommandQueue queue(clqueue);


is::execution_options_type opt(queue, &levents, &waitlist);

if(transA==clblasTrans && transB==clblasTrans)
C = is::control(alpha*dot(A.T(), B.T()) + beta*C, opt);
else if(transA==clblasTrans && transB==clblasNoTrans)
C = is::control(alpha*dot(A.T(), B) + beta*C, opt);
else if(transA==clblasNoTrans && transB==clblasTrans)
C = is::control(alpha*dot(A, B.T()) + beta*C, opt);
else
C = is::control(alpha*dot(A, B) + beta*C, opt);
}

if(events)
*events = static_cast<cl::Event>(levents.front())();
std::cout << events << std::endl;

return clblasSuccess;
}

}
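Since the wrapper exports the standard clBLAS entry points, an existing clBLAS caller can link against it unchanged. A minimal caller sketch; only the clblasSgemm signature above is taken as given, while sgemm_example, queue, dA, dB and dC are hypothetical names for an initialized command queue and cl_mem buffers holding column-major M×K, K×N and M×N single-precision data:

#include <CL/cl.h>
#include "isaac/wrap/clBLAS.h"

void sgemm_example(cl_command_queue queue, cl_mem dA, cl_mem dB, cl_mem dC,
                   size_t M, size_t N, size_t K)
{
    clblasSetup();
    // C = 1.0 * A * B + 0.0 * C, column-major, no transposition, zero offsets, tight leading dimensions.
    clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K,
                1.0f, dA, 0, M,
                dB, 0, K, 0.0f,
                dC, 0, M,
                1, &queue, 0, NULL, NULL);   // one queue, no wait list, no output event
    clFinish(queue);                         // block until the enqueued work completes
    clblasTeardown();
}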