Bench: Fixed CUDA synchronization issue
This commit is contained in:
@@ -45,10 +45,10 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
using std::get;
|
||||
using std::make_tuple;
|
||||
|
||||
unsigned int dtsize = sc::size_of(dtype);
|
||||
//unsigned int dtsize = sc::size_of(dtype);
|
||||
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
|
||||
auto sync = [&](){ queue.synchronize(); };
|
||||
|
||||
auto cusync = [&](){ cudaDeviceSynchronize(); };
|
||||
/*---------*/
|
||||
/*--BLAS1--*/
|
||||
/*---------*/
|
||||
@@ -73,7 +73,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
|
||||
#endif
|
||||
#ifdef BENCH_CUBLAS
|
||||
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
|
||||
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -99,7 +99,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync));
|
||||
#endif
|
||||
#ifdef BENCH_CUBLAS
|
||||
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
|
||||
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -155,7 +155,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
|
||||
#endif
|
||||
#ifdef BENCH_CUBLAS
|
||||
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, sync));
|
||||
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -238,7 +238,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
|
||||
#endif
|
||||
#ifdef BENCH_CUBLAS
|
||||
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, sync));
|
||||
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
|
||||
#endif
|
||||
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
||||
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
|
||||
|
File diff suppressed because one or more lines are too long
@@ -73,7 +73,11 @@ def main():
|
||||
libraries += ['gnustl_shared']
|
||||
|
||||
#Source files
|
||||
<<<<<<< Updated upstream
|
||||
src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
||||
=======
|
||||
src = 'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/value_scalar.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/driver/kernel.cpp src/lib/driver/program.cpp src/lib/driver/check.cpp src/lib/driver/backend.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/device.cpp src/lib/driver/platform.cpp src/lib/driver/handle.cpp src/lib/driver/dispatch.cpp src/lib/driver/buffer.cpp src/lib/driver/program_cache.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
||||
>>>>>>> Stashed changes
|
||||
boostsrc = 'external/boost/libs/'
|
||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
||||
|
@@ -192,7 +192,7 @@ def is_local_optimum(parameters, template, sizes, context):
|
||||
elif issubclass(template, sc.templates.reduce_2d):
|
||||
sweep_over = [0,1,2,3,4]
|
||||
elif issubclass(template, sc.templates.matrix_product):
|
||||
sweep_over = [1,2,3,4,5,6,7]
|
||||
sweep_over = [0,1,2,3,4]
|
||||
|
||||
#Evaluate the provided parameters guess
|
||||
try:
|
||||
|
@@ -99,34 +99,23 @@ class Tuner:
|
||||
if level=='simple':
|
||||
sizes = [(2560,2560,2560)]
|
||||
elif level=='intermediate':
|
||||
sizes = [#Square
|
||||
sizes = [#Square
|
||||
(896,896,896),
|
||||
(1536,1536,1536),
|
||||
(2176, 2176,2176),
|
||||
(1536,1536,1536),
|
||||
(2176, 2176,2176),
|
||||
#Rank-32 updates
|
||||
(896,896,32),
|
||||
(1536,1536,32),
|
||||
(2176,2176,32),
|
||||
#Covariance
|
||||
(32,32,16000),
|
||||
(64,64,64000),
|
||||
(256,256,32000),
|
||||
#Convolutions
|
||||
(3025,64,363),
|
||||
(729,192,1200),
|
||||
(169,384,1728),
|
||||
(169,256,3456),
|
||||
(169,128,2304),
|
||||
(169,2304,256),
|
||||
(169,3456,256),
|
||||
(169,1728,384),
|
||||
(729,1600,192),
|
||||
(3025,363,64),
|
||||
(2304,256,169),
|
||||
(3456,256,169),
|
||||
(1728,384,169),
|
||||
(1600,192,729),
|
||||
(363,64,3025)]
|
||||
(32,32,16000),
|
||||
(64,64,64000),
|
||||
(256,256,32000)]
|
||||
#DeepSpeech
|
||||
sizes = []
|
||||
for MK in [1760, 2048, 2560]:
|
||||
for N in [16, 32, 64, 128, MK]:
|
||||
sizes += [(MK, N, MK)]
|
||||
elif level=='full':
|
||||
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
|
||||
|
||||
@@ -188,15 +177,20 @@ class Tuner:
|
||||
clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
|
||||
#clf, nrmse = model.train(X, Y, profiles)
|
||||
predperf = clf.predict(x)[0]
|
||||
best = (-predperf).argsort()[:5]
|
||||
best = (-predperf).argsort()
|
||||
perf = []
|
||||
for b in best:
|
||||
try:
|
||||
perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
|
||||
break
|
||||
except profile_execution_failure:
|
||||
pass
|
||||
predicted = profiles[best[argmax(perf)]]
|
||||
retune = not optimize.is_local_optimum(predicted, operation, x, context)
|
||||
if perf:
|
||||
predicted = profiles[best[argmax(perf)]]
|
||||
retune = not optimize.is_local_optimum(predicted, operation, x, context)
|
||||
else:
|
||||
retune = True
|
||||
predicted = None
|
||||
|
||||
#Retune if necessary
|
||||
if retune:
|
||||
|
Reference in New Issue
Block a user