Bench: Fixed CUDA synchronization issue
This commit is contained in:
@@ -45,10 +45,10 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
using std::get;
|
using std::get;
|
||||||
using std::make_tuple;
|
using std::make_tuple;
|
||||||
|
|
||||||
unsigned int dtsize = sc::size_of(dtype);
|
//unsigned int dtsize = sc::size_of(dtype);
|
||||||
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
|
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
|
||||||
auto sync = [&](){ queue.synchronize(); };
|
auto sync = [&](){ queue.synchronize(); };
|
||||||
|
auto cusync = [&](){ cudaDeviceSynchronize(); };
|
||||||
/*---------*/
|
/*---------*/
|
||||||
/*--BLAS1--*/
|
/*--BLAS1--*/
|
||||||
/*---------*/
|
/*---------*/
|
||||||
@@ -73,7 +73,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
|
times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
|
times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -99,7 +99,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync));
|
times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync));
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
|
times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -155,7 +155,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
|
times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, sync));
|
times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -238,7 +238,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
|
times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, sync));
|
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
|
||||||
#endif
|
#endif
|
||||||
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
||||||
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
|
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
|
||||||
|
File diff suppressed because one or more lines are too long
@@ -73,7 +73,11 @@ def main():
|
|||||||
libraries += ['gnustl_shared']
|
libraries += ['gnustl_shared']
|
||||||
|
|
||||||
#Source files
|
#Source files
|
||||||
|
<<<<<<< Updated upstream
|
||||||
src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
||||||
|
=======
|
||||||
|
src = 'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/value_scalar.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/driver/kernel.cpp src/lib/driver/program.cpp src/lib/driver/check.cpp src/lib/driver/backend.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/device.cpp src/lib/driver/platform.cpp src/lib/driver/handle.cpp src/lib/driver/dispatch.cpp src/lib/driver/buffer.cpp src/lib/driver/program_cache.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
||||||
|
>>>>>>> Stashed changes
|
||||||
boostsrc = 'external/boost/libs/'
|
boostsrc = 'external/boost/libs/'
|
||||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||||
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
||||||
|
@@ -192,7 +192,7 @@ def is_local_optimum(parameters, template, sizes, context):
|
|||||||
elif issubclass(template, sc.templates.reduce_2d):
|
elif issubclass(template, sc.templates.reduce_2d):
|
||||||
sweep_over = [0,1,2,3,4]
|
sweep_over = [0,1,2,3,4]
|
||||||
elif issubclass(template, sc.templates.matrix_product):
|
elif issubclass(template, sc.templates.matrix_product):
|
||||||
sweep_over = [1,2,3,4,5,6,7]
|
sweep_over = [0,1,2,3,4]
|
||||||
|
|
||||||
#Evaluate the provided parameters guess
|
#Evaluate the provided parameters guess
|
||||||
try:
|
try:
|
||||||
|
@@ -99,34 +99,23 @@ class Tuner:
|
|||||||
if level=='simple':
|
if level=='simple':
|
||||||
sizes = [(2560,2560,2560)]
|
sizes = [(2560,2560,2560)]
|
||||||
elif level=='intermediate':
|
elif level=='intermediate':
|
||||||
sizes = [#Square
|
sizes = [#Square
|
||||||
(896,896,896),
|
(896,896,896),
|
||||||
(1536,1536,1536),
|
(1536,1536,1536),
|
||||||
(2176, 2176,2176),
|
(2176, 2176,2176),
|
||||||
#Rank-32 updates
|
#Rank-32 updates
|
||||||
(896,896,32),
|
(896,896,32),
|
||||||
(1536,1536,32),
|
(1536,1536,32),
|
||||||
(2176,2176,32),
|
(2176,2176,32),
|
||||||
#Covariance
|
#Covariance
|
||||||
(32,32,16000),
|
(32,32,16000),
|
||||||
(64,64,64000),
|
(64,64,64000),
|
||||||
(256,256,32000),
|
(256,256,32000)]
|
||||||
#Convolutions
|
#DeepSpeech
|
||||||
(3025,64,363),
|
sizes = []
|
||||||
(729,192,1200),
|
for MK in [1760, 2048, 2560]:
|
||||||
(169,384,1728),
|
for N in [16, 32, 64, 128, MK]:
|
||||||
(169,256,3456),
|
sizes += [(MK, N, MK)]
|
||||||
(169,128,2304),
|
|
||||||
(169,2304,256),
|
|
||||||
(169,3456,256),
|
|
||||||
(169,1728,384),
|
|
||||||
(729,1600,192),
|
|
||||||
(3025,363,64),
|
|
||||||
(2304,256,169),
|
|
||||||
(3456,256,169),
|
|
||||||
(1728,384,169),
|
|
||||||
(1600,192,729),
|
|
||||||
(363,64,3025)]
|
|
||||||
elif level=='full':
|
elif level=='full':
|
||||||
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
|
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
|
||||||
|
|
||||||
@@ -188,15 +177,20 @@ class Tuner:
|
|||||||
clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
|
clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
|
||||||
#clf, nrmse = model.train(X, Y, profiles)
|
#clf, nrmse = model.train(X, Y, profiles)
|
||||||
predperf = clf.predict(x)[0]
|
predperf = clf.predict(x)[0]
|
||||||
best = (-predperf).argsort()[:5]
|
best = (-predperf).argsort()
|
||||||
perf = []
|
perf = []
|
||||||
for b in best:
|
for b in best:
|
||||||
try:
|
try:
|
||||||
perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
|
perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
|
||||||
|
break
|
||||||
except profile_execution_failure:
|
except profile_execution_failure:
|
||||||
pass
|
pass
|
||||||
predicted = profiles[best[argmax(perf)]]
|
if perf:
|
||||||
retune = not optimize.is_local_optimum(predicted, operation, x, context)
|
predicted = profiles[best[argmax(perf)]]
|
||||||
|
retune = not optimize.is_local_optimum(predicted, operation, x, context)
|
||||||
|
else:
|
||||||
|
retune = True
|
||||||
|
predicted = None
|
||||||
|
|
||||||
#Retune if necessary
|
#Retune if necessary
|
||||||
if retune:
|
if retune:
|
||||||
|
Reference in New Issue
Block a user