Bench: Fixed CUDA synchronization issue

This commit is contained in:
Philippe Tillet
2016-09-29 17:36:21 -04:00
parent 8bf7344681
commit 5d0e29db1f
5 changed files with 31 additions and 33 deletions

View File

@@ -45,10 +45,10 @@ void bench(sc::numeric_type dtype, std::string operation)
using std::get;
using std::make_tuple;
unsigned int dtsize = sc::size_of(dtype);
//unsigned int dtsize = sc::size_of(dtype);
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
auto sync = [&](){ queue.synchronize(); };
auto cusync = [&](){ cudaDeviceSynchronize(); };
/*---------*/
/*--BLAS1--*/
/*---------*/
@@ -73,7 +73,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
#endif
#ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
#endif
}
}
@@ -99,7 +99,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync));
#endif
#ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
#endif
}
}
@@ -155,7 +155,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
#endif
#ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, sync));
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
#endif
}
}
@@ -238,7 +238,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
#endif
#ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, sync));
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
#endif
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});

File diff suppressed because one or more lines are too long

View File

@@ -73,7 +73,11 @@ def main():
libraries += ['gnustl_shared']
#Source files
<<<<<<< Updated upstream
src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
=======
src = 'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/value_scalar.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/driver/kernel.cpp src/lib/driver/program.cpp src/lib/driver/check.cpp src/lib/driver/backend.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/device.cpp src/lib/driver/platform.cpp src/lib/driver/handle.cpp src/lib/driver/dispatch.cpp src/lib/driver/buffer.cpp src/lib/driver/program_cache.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
>>>>>>> Stashed changes
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]

View File

@@ -192,7 +192,7 @@ def is_local_optimum(parameters, template, sizes, context):
elif issubclass(template, sc.templates.reduce_2d):
sweep_over = [0,1,2,3,4]
elif issubclass(template, sc.templates.matrix_product):
sweep_over = [1,2,3,4,5,6,7]
sweep_over = [0,1,2,3,4]
#Evaluate the provided parameters guess
try:

View File

@@ -99,34 +99,23 @@ class Tuner:
if level=='simple':
sizes = [(2560,2560,2560)]
elif level=='intermediate':
sizes = [#Square
sizes = [#Square
(896,896,896),
(1536,1536,1536),
(2176, 2176,2176),
(1536,1536,1536),
(2176, 2176,2176),
#Rank-32 updates
(896,896,32),
(1536,1536,32),
(2176,2176,32),
#Covariance
(32,32,16000),
(64,64,64000),
(256,256,32000),
#Convolutions
(3025,64,363),
(729,192,1200),
(169,384,1728),
(169,256,3456),
(169,128,2304),
(169,2304,256),
(169,3456,256),
(169,1728,384),
(729,1600,192),
(3025,363,64),
(2304,256,169),
(3456,256,169),
(1728,384,169),
(1600,192,729),
(363,64,3025)]
(32,32,16000),
(64,64,64000),
(256,256,32000)]
#DeepSpeech
sizes = []
for MK in [1760, 2048, 2560]:
for N in [16, 32, 64, 128, MK]:
sizes += [(MK, N, MK)]
elif level=='full':
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
@@ -188,15 +177,20 @@ class Tuner:
clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
#clf, nrmse = model.train(X, Y, profiles)
predperf = clf.predict(x)[0]
best = (-predperf).argsort()[:5]
best = (-predperf).argsort()
perf = []
for b in best:
try:
perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
break
except profile_execution_failure:
pass
predicted = profiles[best[argmax(perf)]]
retune = not optimize.is_local_optimum(predicted, operation, x, context)
if perf:
predicted = profiles[best[argmax(perf)]]
retune = not optimize.is_local_optimum(predicted, operation, x, context)
else:
retune = True
predicted = None
#Retune if necessary
if retune: