Bench: Fixed CUDA synchronization issue

2016-09-29 17:36:21 -04:00
parent 8bf7344681
commit 5d0e29db1f
5 changed files with 31 additions and 33 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -45,10 +45,10 @@ void bench(sc::numeric_type dtype, std::string operation)
  using std::get;
  using std::make_tuple;

-  unsigned int dtsize = sc::size_of(dtype);
+  //unsigned int dtsize = sc::size_of(dtype);
  sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
  auto sync = [&](){ queue.synchronize(); };
-
+  auto cusync = [&](){ cudaDeviceSynchronize(); };
  /*---------*/
  /*--BLAS1--*/
  /*---------*/
@@ -73,7 +73,7 @@ void bench(sc::numeric_type dtype, std::string operation)
      times.push_back(bench([&](){cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
-      times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
+      times.push_back(bench([&](){cublasSaxpy(N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
 #endif
    }
  }
@@ -99,7 +99,7 @@ void bench(sc::numeric_type dtype, std::string operation)
      times.push_back(bench([&](){cblas_sdot(N, cx.data(), 1, cy.data(), 1);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
-      times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, sync));
+      times.push_back(bench([&](){cublasSdot(N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
 #endif
    }
  }
@@ -155,7 +155,7 @@ void bench(sc::numeric_type dtype, std::string operation)
      times.push_back(bench([&](){cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
-      times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, sync));
+      times.push_back(bench([&](){cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
 #endif
    }
  }
@@ -238,7 +238,7 @@ void bench(sc::numeric_type dtype, std::string operation)
      times.push_back(bench([&](){cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
-      times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, sync));
+      times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
 #endif
      std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
      std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
--- a/lib/runtime/inference/database/nvidia/json/sm_6_1.json
+++ b/lib/runtime/inference/database/nvidia/json/sm_6_1.json
--- a/python/setup.py
+++ b/python/setup.py
@@ -73,7 +73,11 @@ def main():
      libraries += ['gnustl_shared']

    #Source files
+<<<<<<< Updated upstream
    src =  'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
+=======
+    src =  'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/value_scalar.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/driver/kernel.cpp src/lib/driver/program.cpp src/lib/driver/check.cpp src/lib/driver/backend.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/device.cpp src/lib/driver/platform.cpp src/lib/driver/handle.cpp src/lib/driver/dispatch.cpp src/lib/driver/buffer.cpp src/lib/driver/program_cache.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
+>>>>>>> Stashed changes
    boostsrc = 'external/boost/libs/'
    for s in ['numpy','python','smart_ptr','system','thread']:
        src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
--- a/tune/android/tune/optimize.py
+++ b/tune/android/tune/optimize.py
@@ -192,7 +192,7 @@ def is_local_optimum(parameters, template, sizes, context):
    elif issubclass(template, sc.templates.reduce_2d):
        sweep_over = [0,1,2,3,4]
    elif issubclass(template, sc.templates.matrix_product):
-        sweep_over = [1,2,3,4,5,6,7]
+        sweep_over = [0,1,2,3,4]
    
    #Evaluate the provided parameters guess
    try:
--- a/tune/android/tune/tune.py
+++ b/tune/android/tune/tune.py
@@ -99,34 +99,23 @@ class Tuner:
            if level=='simple':
                sizes = [(2560,2560,2560)]
            elif level=='intermediate':
-                sizes = [#Square
+               sizes = [#Square
                         (896,896,896),
-			 (1536,1536,1536),
-			 (2176, 2176,2176),
+                         (1536,1536,1536),
+                         (2176, 2176,2176),
                         #Rank-32 updates
                         (896,896,32),
                         (1536,1536,32),
                         (2176,2176,32),
                         #Covariance
-			 (32,32,16000),
-			 (64,64,64000),
-                         (256,256,32000),
-                         #Convolutions
-                         (3025,64,363),
-                         (729,192,1200),
-                         (169,384,1728),
-                         (169,256,3456),
-                         (169,128,2304),
-                         (169,2304,256),
-                         (169,3456,256),
-                         (169,1728,384),
-                         (729,1600,192),
-                         (3025,363,64), 
-                         (2304,256,169),
-                         (3456,256,169),
-                         (1728,384,169),
-                         (1600,192,729),
-                         (363,64,3025)]
+                         (32,32,16000),
+                         (64,64,64000),
+                         (256,256,32000)]
+               #DeepSpeech
+               sizes = []
+               for MK in [1760, 2048, 2560]:
+                   for N in [16, 32, 64, 128, MK]:
+                       sizes += [(MK, N, MK)]
            elif level=='full':
 			    sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))

@@ -188,15 +177,20 @@ class Tuner:
                    clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
                    #clf, nrmse = model.train(X, Y, profiles)
                    predperf = clf.predict(x)[0]
-                    best = (-predperf).argsort()[:5]
+                    best = (-predperf).argsort()
                    perf = []
                    for b in best:
                        try:
                            perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
+                            break
                        except profile_execution_failure:
                            pass
-                    predicted = profiles[best[argmax(perf)]]
-                retune = not optimize.is_local_optimum(predicted, operation, x, context)
+                    if perf:
+                        predicted = profiles[best[argmax(perf)]]
+                        retune = not optimize.is_local_optimum(predicted, operation, x, context)
+                    else:
+                        retune = True
+                        predicted = None
                
            #Retune if necessary
            if retune: