Bench: Fixed CUDA synchronization issue

This commit is contained in:
Philippe Tillet
2016-09-29 17:36:21 -04:00
parent 8bf7344681
commit 5d0e29db1f
5 changed files with 31 additions and 33 deletions

View File

@@ -45,10 +45,10 @@ void bench(sc::numeric_type dtype, std::string operation)
using std::get; using std::get;
using std::make_tuple; using std::make_tuple;
unsigned int dtsize = sc::size_of(dtype); //unsigned int dtsize = sc::size_of(dtype);
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0); sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
auto sync = [&](){ queue.synchronize(); }; auto sync = [&](){ queue.synchronize(); };
auto cusync = [&](){ cudaDeviceSynchronize(); };
/*---------*/ /*---------*/
/*--BLAS1--*/ /*--BLAS1--*/
/*---------*/ /*---------*/
@@ -73,7 +73,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync)); times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
#endif #endif
#ifdef BENCH_CUBLAS #ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, sync)); times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
#endif #endif
} }
} }
@@ -99,7 +99,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync)); times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync));
#endif #endif
#ifdef BENCH_CUBLAS #ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, sync)); times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
#endif #endif
} }
} }
@@ -155,7 +155,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync)); times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
#endif #endif
#ifdef BENCH_CUBLAS #ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, sync)); times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
#endif #endif
} }
} }
@@ -238,7 +238,7 @@ void bench(sc::numeric_type dtype, std::string operation)
times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync)); times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
#endif #endif
#ifdef BENCH_CUBLAS #ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, sync)); times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
#endif #endif
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT; std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;}); std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});

File diff suppressed because one or more lines are too long

View File

@@ -73,7 +73,11 @@ def main():
libraries += ['gnustl_shared'] libraries += ['gnustl_shared']
#Source files #Source files
<<<<<<< Updated upstream
src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']] src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
=======
src = 'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/value_scalar.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/driver/kernel.cpp src/lib/driver/program.cpp src/lib/driver/check.cpp src/lib/driver/backend.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/device.cpp src/lib/driver/platform.cpp src/lib/driver/handle.cpp src/lib/driver/dispatch.cpp src/lib/driver/buffer.cpp src/lib/driver/program_cache.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
>>>>>>> Stashed changes
boostsrc = 'external/boost/libs/' boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']: for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x] src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]

View File

@@ -192,7 +192,7 @@ def is_local_optimum(parameters, template, sizes, context):
elif issubclass(template, sc.templates.reduce_2d): elif issubclass(template, sc.templates.reduce_2d):
sweep_over = [0,1,2,3,4] sweep_over = [0,1,2,3,4]
elif issubclass(template, sc.templates.matrix_product): elif issubclass(template, sc.templates.matrix_product):
sweep_over = [1,2,3,4,5,6,7] sweep_over = [0,1,2,3,4]
#Evaluate the provided parameters guess #Evaluate the provided parameters guess
try: try:

View File

@@ -99,34 +99,23 @@ class Tuner:
if level=='simple': if level=='simple':
sizes = [(2560,2560,2560)] sizes = [(2560,2560,2560)]
elif level=='intermediate': elif level=='intermediate':
sizes = [#Square sizes = [#Square
(896,896,896), (896,896,896),
(1536,1536,1536), (1536,1536,1536),
(2176, 2176,2176), (2176, 2176,2176),
#Rank-32 updates #Rank-32 updates
(896,896,32), (896,896,32),
(1536,1536,32), (1536,1536,32),
(2176,2176,32), (2176,2176,32),
#Covariance #Covariance
(32,32,16000), (32,32,16000),
(64,64,64000), (64,64,64000),
(256,256,32000), (256,256,32000)]
#Convolutions #DeepSpeech
(3025,64,363), sizes = []
(729,192,1200), for MK in [1760, 2048, 2560]:
(169,384,1728), for N in [16, 32, 64, 128, MK]:
(169,256,3456), sizes += [(MK, N, MK)]
(169,128,2304),
(169,2304,256),
(169,3456,256),
(169,1728,384),
(729,1600,192),
(3025,363,64),
(2304,256,169),
(3456,256,169),
(1728,384,169),
(1600,192,729),
(363,64,3025)]
elif level=='full': elif level=='full':
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17)) sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
@@ -188,15 +177,20 @@ class Tuner:
clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y) clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
#clf, nrmse = model.train(X, Y, profiles) #clf, nrmse = model.train(X, Y, profiles)
predperf = clf.predict(x)[0] predperf = clf.predict(x)[0]
best = (-predperf).argsort()[:5] best = (-predperf).argsort()
perf = [] perf = []
for b in best: for b in best:
try: try:
perf += [performance(x, tools.benchmark(operation, profiles[b], tree))] perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
break
except profile_execution_failure: except profile_execution_failure:
pass pass
predicted = profiles[best[argmax(perf)]] if perf:
retune = not optimize.is_local_optimum(predicted, operation, x, context) predicted = profiles[best[argmax(perf)]]
retune = not optimize.is_local_optimum(predicted, operation, x, context)
else:
retune = True
predicted = None
#Retune if necessary #Retune if necessary
if retune: if retune: