143 lines
4.9 KiB
C
143 lines
4.9 KiB
C
![]() |
#include <stdexcept>

#include "src/copy.h"
#include "triton/driver/stream.h"
#include "triton/runtime/function.h"
#include "triton/tools/bench.hpp"
#include "util.h"
|
||
|
|
||
|
// Linear offset of a multi-index: the sum over every entry of `idx` of
// idx[d] * strides[d]. `strides` is read at positions 0..idx.size()-1 with no
// bounds check, so it must have at least as many entries as `idx`.
int32_t off(const std::vector<int32_t>& idx, const std::vector<int32_t>& strides) {
  int32_t offset = 0;
  size_t dim = 0;
  for(int32_t coord : idx) {
    offset += coord * strides[dim];
    ++dim;
  }
  return offset;
}
|
||
|
|
||
|
// Selects which branch of triton_copy_nd runs after the kernel is built.
enum run_mode_t {
  BENCH = 0,  // time the kernel and append the measured throughput to `bench`
  TEST = 1    // run the kernel once and compare its output with cc_copy_nd
};
|
||
|
|
||
|
/**
 * Host-side reference implementation of an N-dimensional strided copy.
 *
 * For every multi-index `idx` inside `shape` this performs
 *   y[sum_d idx[d] * y_strides[d]] = x[sum_d idx[d] * x_strides[d]]
 * where each tensor's strides are derived from its dimension order:
 * order[0] names the dimension with stride 1, and every following dimension
 * gets the previous stride multiplied by the previous dimension's extent.
 *
 * Any rank >= 1 is supported. Multi-indices are visited with the last
 * dimension varying fastest. Nothing is copied when `shape` is empty or when
 * any extent is <= 0.
 *
 * @param x        source elements
 * @param y        destination elements; must already be large enough, it is
 *                 never resized here
 * @param shape    extent of each dimension
 * @param x_order  dimension order of `x`, fastest-varying dimension first
 * @param y_order  dimension order of `y`, fastest-varying dimension first
 *
 * No bounds checking is performed: `x_order` and `y_order` must each hold
 * `shape.size()` valid dimension indices, and `x` / `y` must cover every
 * offset produced by the strides.
 */
template<class T>
void cc_copy_nd(const std::vector<T>& x, std::vector<T>& y,
                const std::vector<int32_t>& shape,
                const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
  const size_t rank = shape.size();
  if(rank == 0)
    return;
  // An empty (or negative) extent means the index space is empty.
  for(int32_t extent : shape)
    if(extent <= 0)
      return;

  // Turns a dimension order into per-dimension strides.
  auto strides_for = [&shape, rank](const std::vector<int32_t>& order) {
    std::vector<int32_t> strides(rank);
    int32_t step = 1;
    for(size_t d = 0; d < rank; d++) {
      // Multiply lazily so the product of *all* extents is never formed.
      if(d > 0)
        step *= shape[order[d - 1]];
      strides[order[d]] = step;
    }
    return strides;
  };
  const std::vector<int32_t> x_strides = strides_for(x_order);
  const std::vector<int32_t> y_strides = strides_for(y_order);

  // Walk the index space like an odometer: the last dimension ticks fastest.
  std::vector<int32_t> idx(rank, 0);
  for(;;) {
    int64_t x_off = 0;
    int64_t y_off = 0;
    for(size_t d = 0; d < rank; d++) {
      x_off += static_cast<int64_t>(idx[d]) * x_strides[d];
      y_off += static_cast<int64_t>(idx[d]) * y_strides[d];
    }
    y[static_cast<size_t>(y_off)] = x[static_cast<size_t>(x_off)];

    // Advance to the next multi-index, carrying into slower dimensions.
    size_t d = rank;
    while(d > 0 && ++idx[d - 1] == shape[d - 1]) {
      idx[d - 1] = 0;
      --d;
    }
    // Carry ran off the slowest dimension: every index has been visited.
    if(d == 0)
      break;
  }
}
|
||
|
|
||
|
/**
 * Builds the Triton N-d copy kernel for `shape` and, depending on `mode`,
 * either benchmarks it or checks its output against the host reference
 * cc_copy_nd. The element type is fixed to float.
 *
 * @param stream   stream the kernel is launched on; its context is used to
 *                 create the two buffers
 * @param shape    extent of each dimension; the rank must be between 1 and 3
 *                 because only the S0..S2 / TS0..TS2 names are available
 * @param x_order  dimension order of the source, fastest-varying first
 * @param y_order  dimension order of the destination, fastest-varying first
 * @param TS       per-dimension lists of tile-size strings, passed to the
 *                 kernel as the TS<d> defines; when empty, tile_nd(rank) is
 *                 used instead
 * @param mode     BENCH or TEST
 * @param bench    BENCH only: one throughput value (GB/s) is appended
 * @param test     TEST only: receives the result of testing::diff between the
 *                 kernel output and the reference; left untouched in BENCH mode
 *
 * @throws std::invalid_argument if the rank is unsupported, or if `x_order`,
 *         `y_order` or a non-empty `TS` has fewer entries than the rank.
 */
void triton_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
                    const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
                    std::vector<std::vector<std::string>> TS,
                    run_mode_t mode, std::vector<double>& bench, bool &test) {
  typedef float NumericT;
  std::string ty = "float";
  size_t dtsize = sizeof(NumericT);

  // rank
  size_t rank = shape.size();
  std::vector<std::string> shapename = {"S0", "S1", "S2"};
  // Reject ranks that would underflow `rank - 1` below or index past the
  // available shape/tile names.
  if(rank == 0 || rank > shapename.size())
    throw std::invalid_argument("triton_copy_nd: unsupported rank " + std::to_string(rank) +
                                " (expected 1 to " + std::to_string(shapename.size()) + ")");
  if(x_order.size() < rank || y_order.size() < rank)
    throw std::invalid_argument("triton_copy_nd: x_order and y_order need one entry per dimension");

  drv::context* context = stream->context();

  // size: total number of elements
  size_t size = 1;
  for(int32_t d: shape)
    size *= d;

  // strides for x, as source-level expressions: entry d is the stride of the
  // d-th fastest dimension ("1", "1 * S<a>", "1 * S<a> * S<b>", ...)
  std::vector<std::string> x_strides = {"1"};
  for(size_t d = 0; d < rank - 1; d++)
    x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
  // strides for y
  std::vector<std::string> y_strides = {"1"};
  for(size_t d = 0; d < rank - 1; d++)
    y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);

  // create inputs
  auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
  auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));

  // create options
  rt::function::options_space_t opt;

  // macros
  opt.defines.push_back({"TYPE", {ty}});
  for(size_t d = 0; d < rank; d++)
    opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
  for(size_t d = 0; d < rank; d++)
    opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
  if(TS.empty())
    TS = tile_nd(rank);
  if(TS.size() < rank)
    throw std::invalid_argument("triton_copy_nd: TS needs one entry per dimension");
  for(size_t d = 0; d < rank; d++)
    opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
  opt.num_warps = {4};

  // kernel: one source per rank, followed by its launch arguments
  // (source buffer, destination buffer, then each extent)
  rt::function function(src::copy_nd[rank - 1], opt);
  std::vector<rt::arg> args = {&*dx, &*dy};
  for(int32_t d: shape)
    args.push_back(d);
  std::vector<std::string> ts = {"TS0", "TS1", "TS2"};
  auto grid = grid_nd(shape, ts);

  // metrics
  if(mode == BENCH){
    // Every element is read once and written once, hence the factor 2.
    // NOTE(review): assumes triton::tools::bench reports nanoseconds - confirm.
    auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
    double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream);
    bench.push_back(gbps(triton_ns));
  }

  // test triton
  if(mode == TEST){
    std::vector<NumericT> hx(size);
    std::vector<NumericT> hy(size);
    std::vector<NumericT> ry(size);
    // Fill the source with values in [0, 1]. rand() is not seeded here.
    for(size_t i = 0; i < hx.size(); i++)
      hx[i] = static_cast<NumericT>(static_cast<float>(rand())/RAND_MAX);
    stream->write(&*dx, true, 0, hx);
    function(args, grid, stream);
    stream->synchronize();
    stream->read(&*dy, true, 0, hy);
    // Host reference result to compare the kernel output against.
    cc_copy_nd(hx, ry, shape, x_order, y_order);
    test = testing::diff(hy, ry);
  }
}
|
||
|
|
||
|
// Benchmarks the Triton copy kernel for `shape` with the given source and
// destination dimension orders, and returns whatever triton_copy_nd appended
// to the benchmark vector in BENCH mode. No tile sizes are supplied, so
// triton_copy_nd falls back to its own default.
std::vector<double> bench_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
                                  const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
  // Required by the triton_copy_nd signature; not read in this function.
  bool ignored_test_result;
  std::vector<double> measurements;
  const std::vector<std::vector<std::string>> no_tiles;
  triton_copy_nd(stream, shape, x_order, y_order, no_tiles, BENCH, measurements, ignored_test_result);
  return measurements;
}
|
||
|
|
||
|
// Runs the Triton copy kernel once in TEST mode with one fixed tile size per
// dimension, and returns the flag that triton_copy_nd stores after comparing
// the kernel output with the host reference.
bool test_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
                  const std::vector<int32_t>& TS,
                  const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
  // Each tile size becomes a single-entry list of its decimal string.
  std::vector<std::vector<std::string>> tile_lists;
  for(size_t d = 0; d < TS.size(); ++d)
    tile_lists.push_back(std::vector<std::string>(1, std::to_string(TS[d])));

  // Required by the triton_copy_nd signature; not read in this function.
  std::vector<double> ignored_bench;
  bool passed;
  triton_copy_nd(stream, shape, x_order, y_order, tile_lists, TEST, ignored_bench, passed);
  return passed;
}
|