Files
triton/tests/common/copy.h
2020-04-09 01:10:11 -04:00

182 lines
5.7 KiB
C++

#include "src/copy.h"
#include "triton/driver/stream.h"
#include "triton/runtime/function.h"
#include "triton/tools/bench.hpp"
#include "triton/external/half.hpp"
#include "util.h"
int32_t off(const std::vector<int32_t>& idx, const std::vector<int32_t>& strides) {
int32_t res = 0;
for(size_t d = 0; d < idx.size(); d++)
res += idx[d] * strides[d];
return res;
}
enum run_mode_t {
BENCH,
TEST
};
enum dtype_t {
FLOAT,
HALF,
DOUBLE
};
template<class T>
void cc_copy_nd(const std::vector<T>& x, std::vector<T>& y,
const std::vector<int32_t>& shape,
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
size_t rank = shape.size();
// strides for x
std::vector<int32_t> x_strides(shape.size());
for(size_t d = 0; d < rank; d++)
x_strides[x_order[d]] = (d == 0) ? 1 : (x_strides[x_order[d-1]] * shape[x_order[d-1]]);
// strides for y
std::vector<int32_t> y_strides(shape.size());
for(size_t d = 0; d < rank; d++)
y_strides[y_order[d]] = (d == 0) ? 1 : (y_strides[y_order[d-1]] * shape[y_order[d-1]]);
// copy 1d
if(rank == 1)
for(int32_t i = 0; i < shape[0]; i++)
y[off({i}, y_strides)] = x[off({i}, x_strides)];
// copy 2d
if(rank == 2)
for(int32_t i = 0; i < shape[0]; i++)
for(int32_t j = 0; j < shape[1]; j++)
y[off({i, j}, y_strides)] = x[off({i, j}, x_strides)];
// copy 3d
if(rank == 3)
for(int32_t i = 0; i < shape[0]; i++)
for(int32_t j = 0; j < shape[1]; j++)
for(int32_t k = 0; k < shape[2]; k++)
y[off({i, j, k}, y_strides)] = x[off({i, j, k}, x_strides)];
}
template<class T>
struct to_string;
template<> struct to_string<half_float::half>{
static constexpr const char* value = "half";
};
template<> struct to_string<float>{
static constexpr const char* value = "float";
};
template<> struct to_string<double>{
static constexpr const char* value = "double";
};
template<typename T>
void triton_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
std::vector<std::vector<std::string>> TS,
run_mode_t mode, std::vector<double>& bench, bool &test) {
std::string ty = to_string<T>::value;
size_t dtsize = sizeof(T);
drv::context* context = stream->context();
// rank
size_t rank = shape.size();
// size
size_t size = 1;
for(int32_t d: shape)
size *= d;
std::vector<std::string> shapename = {"S0", "S1", "S2"};
// strides for x
std::vector<std::string> x_strides = {"1"};
for(size_t d = 0; d < rank - 1; d++)
x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
// strides for y
std::vector<std::string> y_strides = {"1"};
for(size_t d = 0; d < rank - 1; d++)
y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
// create inputs
auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
// create options
rt::function::options_space_t opt;
// macros
opt.defines.push_back({"TYPE", {ty}});
for(size_t d = 0; d < rank; d++)
opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
for(size_t d = 0; d < rank; d++)
opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
if(TS.empty())
TS = tile_nd(rank);
for(size_t d = 0; d < rank; d++)
opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
opt.num_warps = {4};
// kernel
rt::function function(src::copy_nd[rank - 1], opt);
std::vector<rt::arg> args = {&*dx, &*dy};
for(int32_t d: shape)
args.push_back(d);
std::vector<std::string> ts = {"TS0", "TS1", "TS2"};
auto grid = grid_nd(shape, ts);
// metrics
if(mode == BENCH){
auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream);
bench.push_back(gbps(triton_ns));
}
// test triton
if(mode == TEST){
std::vector<T> hx(size);
std::vector<T> hy(size);
std::vector<T> ry(size);
for(size_t i = 0; i < hx.size(); i++)
hx[i] = static_cast<T>((float)rand()/RAND_MAX);
stream->write(&*dx, true, 0, hx);
function(args, grid, stream);
stream->synchronize();
stream->read(&*dy, true, 0, hy);
cc_copy_nd(hx, ry, shape, x_order, y_order);
test = testing::diff(hy, ry);
}
}
std::vector<double> bench_copy_nd(drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
std::vector<double> bench;
bool test;
switch(dtype){
case HALF:
triton_copy_nd<half_float::half>(stream, shape, x_order, y_order, {}, BENCH, bench, test);
break;
case FLOAT:
triton_copy_nd<float>(stream, shape, x_order, y_order, {}, BENCH, bench, test);
break;
default: break;
}
return bench;
}
bool test_copy_nd(drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
const std::vector<int32_t>& TS,
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
std::vector<double> bench;
bool test;
std::vector<std::vector<std::string>> TSS;
for(int32_t d: TS)
TSS.push_back({std::to_string(d)});
switch(dtype){
case HALF:
triton_copy_nd<half_float::half>(stream, shape, x_order, y_order, TSS, TEST, bench, test);
break;
case FLOAT:
triton_copy_nd<float>(stream, shape, x_order, y_order, TSS, TEST, bench, test);
break;
default: break;
}
return test;
}