// C++ source (191 lines, 5.9 KiB): host-side benchmark and test harness for the Triton N-d copy kernel.
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include "src/copy.h"
#include "triton/driver/stream.h"
#include "triton/runtime/function.h"
#include "triton/tools/bench.hpp"
#include "triton/external/half.hpp"
#include "util.h"
|
|
|
|
// Linear offset of a multi-dimensional index.
//   idx     : one coordinate per dimension
//   strides : element stride of each dimension; must hold at least
//             idx.size() entries (not checked here)
// Returns the sum over all dimensions d of idx[d] * strides[d].
int32_t off(const std::vector<int32_t>& idx, const std::vector<int32_t>& strides) {
  int32_t offset = 0;
  size_t dim = 0;
  for(int32_t coord: idx){
    offset += coord * strides[dim];
    ++dim;
  }
  return offset;
}
|
|
|
|
struct copy_arg_t{
|
|
CUdeviceptr X;
|
|
CUdeviceptr Y;
|
|
int S0;
|
|
int S1;
|
|
int S2;
|
|
};
|
|
|
|
|
|
// Selects what triton_copy_nd does with the compiled kernel.
enum run_mode_t {
  BENCH = 0,  // time the kernel and append its bandwidth to the bench vector
  TEST  = 1   // run once and compare against the CPU reference copy
};
|
|
|
|
// Element types a caller can request for the copy kernel.
// NOTE(review): DOUBLE is declared here but the dispatchers in this file
// (bench_copy_nd / test_copy_nd) only handle HALF and FLOAT.
enum dtype_t {
  FLOAT  = 0,
  HALF   = 1,
  DOUBLE = 2
};
|
|
|
|
|
|
// CPU reference implementation of an N-dimensional strided copy.
//
// Copies every element of `x` into `y`, where both hold the same logical
// tensor of extents `shape` but may use different memory layouts.
//
//   x        : source elements (read-only)
//   y        : destination; must already be large enough to hold the tensor
//   shape    : extent of each dimension
//   x_order  : dimensions of x listed from fastest-varying (stride 1) to
//              slowest-varying; must contain one entry per dimension
//   y_order  : same, for y
//
// Works for any rank >= 1 (the earlier version silently did nothing for
// rank > 3). A rank of 0, or any non-positive extent, copies nothing.
// Elements are visited with the last dimension varying fastest, the same
// order the former hand-written 1d/2d/3d loop nests used.
template<class T>
void cc_copy_nd(const std::vector<T>& x, std::vector<T>& y,
                const std::vector<int32_t>& shape,
                const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
  const size_t rank = shape.size();
  if(rank == 0)
    return;

  // strides[order[0]] = 1, and each following dimension in `order` gets the
  // previous stride multiplied by the previous dimension's extent
  auto make_strides = [&](const std::vector<int32_t>& order) -> std::vector<int32_t> {
    std::vector<int32_t> strides(rank);
    int32_t running = 1;
    for(size_t d = 0; d < rank; d++){
      strides[order[d]] = running;
      running *= shape[order[d]];
    }
    return strides;
  };
  const std::vector<int32_t> x_strides = make_strides(x_order);
  const std::vector<int32_t> y_strides = make_strides(y_order);

  // an empty tensor has nothing to copy
  for(int32_t extent: shape)
    if(extent <= 0)
      return;

  // odometer-style walk over the index space; the two linear offsets are
  // updated incrementally instead of being recomputed for every element
  std::vector<int32_t> idx(rank, 0);
  int32_t x_off = 0;
  int32_t y_off = 0;
  for(;;){
    y[y_off] = x[x_off];
    size_t d = rank;
    for(; d > 0; d--){
      const size_t axis = d - 1;
      idx[axis]++;
      x_off += x_strides[axis];
      y_off += y_strides[axis];
      if(idx[axis] < shape[axis])
        break;
      // this digit overflowed: rewind it to zero and carry into the next axis
      x_off -= idx[axis] * x_strides[axis];
      y_off -= idx[axis] * y_strides[axis];
      idx[axis] = 0;
    }
    // every digit wrapped around: the whole index space has been visited
    if(d == 0)
      return;
  }
}
|
|
|
|
template<class T>
|
|
struct to_string;
|
|
|
|
template<> struct to_string<half_float::half>{
|
|
static constexpr const char* value = "half";
|
|
};
|
|
|
|
template<> struct to_string<float>{
|
|
static constexpr const char* value = "float";
|
|
};
|
|
|
|
template<> struct to_string<double>{
|
|
static constexpr const char* value = "double";
|
|
};
|
|
|
|
template<typename T>
|
|
void triton_copy_nd(drv::context* context, drv::stream* stream, const std::vector<int32_t>& shape,
|
|
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
|
|
std::vector<std::vector<std::string>> TS,
|
|
run_mode_t mode, std::vector<double>& bench, bool &test) {
|
|
std::string ty = to_string<T>::value;
|
|
size_t dtsize = sizeof(T);
|
|
drv::device* device = context->device();
|
|
|
|
// rank
|
|
size_t rank = shape.size();
|
|
// size
|
|
size_t size = 1;
|
|
for(int32_t d: shape)
|
|
size *= d;
|
|
std::vector<std::string> shapename = {"S0", "S1", "S2"};
|
|
// strides for x
|
|
std::vector<std::string> x_strides = {"1"};
|
|
for(size_t d = 0; d < rank - 1; d++)
|
|
x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
|
|
// strides for y
|
|
std::vector<std::string> y_strides = {"1"};
|
|
for(size_t d = 0; d < rank - 1; d++)
|
|
y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
|
|
|
|
// create inputs
|
|
auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
|
|
auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
|
|
// create options
|
|
rt::function::options_space_t opt;
|
|
|
|
|
|
// macros
|
|
opt.defines.push_back({"TYPE", {ty}});
|
|
for(size_t d = 0; d < rank; d++)
|
|
opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
|
|
for(size_t d = 0; d < rank; d++)
|
|
opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
|
|
if(TS.empty())
|
|
TS = tile_nd(rank);
|
|
for(size_t d = 0; d < rank; d++)
|
|
opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
|
|
opt.num_warps = {4};
|
|
|
|
// kernel
|
|
rt::function function(src::copy_nd[rank - 1], opt);
|
|
copy_arg_t args = {*dx->cu(), *dy->cu(), shape[0]};
|
|
if(shape.size() > 1) args.S1 = shape[1];
|
|
if(shape.size() > 2) args.S2 = shape[2];
|
|
std::vector<std::string> ts = {"TS0", "TS1", "TS2"};
|
|
auto grid = grid_nd(shape, ts);
|
|
|
|
// metrics
|
|
if(mode == BENCH){
|
|
auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
|
|
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream, device);}, stream);
|
|
bench.push_back(gbps(triton_ns));
|
|
}
|
|
|
|
// test triton
|
|
if(mode == TEST){
|
|
std::vector<T> hx(size);
|
|
std::vector<T> hy(size);
|
|
std::vector<T> ry(size);
|
|
for(size_t i = 0; i < hx.size(); i++)
|
|
hx[i] = static_cast<T>((float)rand()/RAND_MAX);
|
|
stream->write(&*dx, true, 0, hx);
|
|
function((void**)&args, sizeof(args), grid, stream, device);
|
|
stream->synchronize();
|
|
stream->read(&*dy, true, 0, hy);
|
|
cc_copy_nd(hx, ry, shape, x_order, y_order);
|
|
test = testing::diff(hy, ry);
|
|
}
|
|
}
|
|
|
|
// Benchmarks the copy kernel for the requested element type.
// Tile sizes are left empty so triton_copy_nd falls back to tile_nd(rank).
// Returns the figures triton_copy_nd appended in BENCH mode; the vector is
// empty for element types other than HALF and FLOAT.
std::vector<double> bench_copy_nd(drv::context* context, drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
                                  const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
  std::vector<double> results;
  // required by the triton_copy_nd signature; not written in BENCH mode
  bool ignored = false;
  if(dtype == HALF)
    triton_copy_nd<half_float::half>(context, stream, shape, x_order, y_order, {}, BENCH, results, ignored);
  else if(dtype == FLOAT)
    triton_copy_nd<float>(context, stream, shape, x_order, y_order, {}, BENCH, results, ignored);
  return results;
}
|
|
|
|
// Runs the copy kernel once with fixed tile sizes and compares its output
// with the CPU reference.
//
//   dtype : element type; only HALF and FLOAT are dispatched
//   shape : extent of each dimension
//   TS    : one tile size per dimension, forwarded as a single-candidate
//           list for each TS<d> macro
//
// Returns whatever testing::diff reported for the run (stored by
// triton_copy_nd in TEST mode). For an element type that is not dispatched
// (e.g. DOUBLE) no kernel runs and the result is false. Previously an
// uninitialized bool was returned in that case.
bool test_copy_nd(drv::context* context, drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
                  const std::vector<int32_t>& TS,
                  const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
  std::vector<double> bench;
  // must start defined: triton_copy_nd is not called for unhandled dtypes
  bool test = false;
  std::vector<std::vector<std::string>> TSS;
  TSS.reserve(TS.size());
  for(int32_t d: TS)
    TSS.push_back({std::to_string(d)});
  switch(dtype){
    case HALF:
      triton_copy_nd<half_float::half>(context, stream, shape, x_order, y_order, TSS, TEST, bench, test);
      break;
    case FLOAT:
      triton_copy_nd<float>(context, stream, shape, x_order, y_order, TSS, TEST, bench, test);
      break;
    default: break;
  }
  return test;
}
|