triton/bench/overhead.cpp

#include "atidlas/array.h"
#include "atidlas/tools/timer.hpp"

#include <vector>

namespace ad = atidlas;

#ifdef BENCH_CUBLAS
// Empty CUDA kernel: it does no work, so timing its launch isolates the cost
// of the launch itself.
__global__ void dummy()
{
}
#endif


int main()
{
  for(ad::cl_ext::queues_type::data_type::const_iterator it = ad::cl_ext::queues.data().begin() ; it != ad::cl_ext::queues.data().end() ; ++it)
  {
    cl::CommandQueue queue = it->second[0];
    cl::Context context = it->first;
    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    cl::Program program(context,"__kernel void dummy(){}");
    program.build();
    cl::Kernel kernel(program, "dummy");

    cl::NDRange offset = cl::NullRange;
    cl::NDRange global(1);
    cl::NDRange local(1);

    cl::Event event;
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "-------------------------" << std::endl;

    queue.enqueueNDRangeKernel(kernel, offset, global, local, NULL, &event);
    queue.flush();
    queue.finish();

    {
    long time = event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    std::cout << "Kernel launch overhead: " << time << std::endl;
    }

#ifdef BENCH_CUBLAS
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    dummy<<<1, 1>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "CUDA Kernel launch overhead: " << time << std::endl;
#endif
    std::cout << "-------------------------" << std::endl;
  }

}
Added benchmark for expression tree creation 2015-01-18 17:12:09 -05:00			`#include "atidlas/array.h"`
			`#include "atidlas/tools/timer.hpp"`

			`#include <vector>`

			`namespace ad = atidlas;`

Added a control flow API 2015-02-03 15:20:33 -05:00			`#ifdef BENCH_CUBLAS`
			`__global__ void dummy(){}`
			`#endif`


Added benchmark for expression tree creation 2015-01-18 17:12:09 -05:00			`int main()`
			`{`
More efficient access pattern in the GEMV kernel 2015-02-10 23:01:16 -05:00			`for(ad::cl_ext::queues_type::data_type::const_iterator it = ad::cl_ext::queues.data().begin() ; it != ad::cl_ext::queues.data().end() ; ++it)`
Multiple devices for tests 2015-01-19 14:40:13 -05:00			`{`
More efficient access pattern in the GEMV kernel 2015-02-10 23:01:16 -05:00			`cl::CommandQueue queue = it->second[0];`
			`cl::Context context = it->first;`
Added a control flow API 2015-02-03 15:20:33 -05:00			`cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();`
Fixed overhead-benchmark 2015-02-06 02:00:02 -05:00			`cl::Program program(context,"__kernel void dummy(){}");`
			`program.build();`
Added a control flow API 2015-02-03 15:20:33 -05:00			`cl::Kernel kernel(program, "dummy");`

			`cl::NDRange offset = cl::NullRange;`
			`cl::NDRange global(1);`
			`cl::NDRange local(1);`

			`cl::Event event;`
Multiple devices for tests 2015-01-19 14:40:13 -05:00			`std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;`
			`std::cout << "-------------------------" << std::endl;`
Added a control flow API 2015-02-03 15:20:33 -05:00
			`queue.enqueueNDRangeKernel(kernel, offset, global, local, NULL, &event);`
			`queue.flush();`
			`queue.finish();`

Fixed overhead-benchmark 2015-02-06 02:00:02 -05:00			`{`
			`long time = event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>();`
Added a control flow API 2015-02-03 15:20:33 -05:00			`std::cout << "Kernel launch overhead: " << time << std::endl;`
Fixed overhead-benchmark 2015-02-06 02:00:02 -05:00			`}`
Added a control flow API 2015-02-03 15:20:33 -05:00
			`#ifdef BENCH_CUBLAS`
Fixed overhead-benchmark 2015-02-06 02:00:02 -05:00			`float time;`
Added a control flow API 2015-02-03 15:20:33 -05:00			`cudaEvent_t start, stop;`
			`cudaEventCreate(&start);`
			`cudaEventCreate(&stop);`
			`cudaEventRecord(start);`
Fixed overhead-benchmark 2015-02-06 02:00:02 -05:00			`dummy<<<1, 1>>>();`
Added a control flow API 2015-02-03 15:20:33 -05:00			`cudaEventRecord(stop);`
Fixed overhead-benchmark 2015-02-06 02:00:02 -05:00			`cudaEventSynchronize(stop);`
Added a control flow API 2015-02-03 15:20:33 -05:00			`cudaEventElapsedTime(&time, start, stop);`
			`std::cout << "CUDA Kernel launch overhead: " << time << std::endl;`
			`#endif`
Multiple devices for tests 2015-01-19 14:40:13 -05:00			`std::cout << "-------------------------" << std::endl;`
Added benchmark for expression tree creation 2015-01-18 17:12:09 -05:00			`}`

			`}`