From 7ccd771cccbc5ae7832b0031c5e1070414caf113 Mon Sep 17 00:00:00 2001 From: Stephen Akridge Date: Wed, 1 Aug 2018 14:10:39 -0700 Subject: [PATCH] Only send sigverify to GPU if batch size is >=64 Seems to be a decent crossover point for Xeon E5-2620 v4 8c,16t vs. nvidia 1080ti --- Cargo.toml | 4 ++++ benches/sigverify.rs | 36 ++++++++++++++++++++++++++++++++++++ src/sigverify.rs | 16 +++++++++++++++- src/transaction.rs | 1 - 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 benches/sigverify.rs diff --git a/Cargo.toml b/Cargo.toml index 4389e28607..6d1b39dd29 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,3 +116,7 @@ harness = false [[bench]] name = "signature" harness = false + +[[bench]] +name = "sigverify" +harness = false diff --git a/benches/sigverify.rs b/benches/sigverify.rs new file mode 100644 index 0000000000..0c774f1e5b --- /dev/null +++ b/benches/sigverify.rs @@ -0,0 +1,36 @@ +#[macro_use] +extern crate criterion; +extern crate bincode; +extern crate rayon; +extern crate solana; + +use criterion::{Bencher, Criterion}; +use solana::packet::{to_packets, PacketRecycler}; +use solana::sigverify; +use solana::transaction::test_tx; + +fn bench_sig_verify(bencher: &mut Bencher) { + let tx = test_tx(); + + // generate packet vector + let packet_recycler = PacketRecycler::default(); + let batches = to_packets(&packet_recycler, &vec![tx; 128]); + + // verify packets + bencher.iter(|| { + let _ans = sigverify::ed25519_verify(&batches); + }) +} + +fn bench(criterion: &mut Criterion) { + criterion.bench_function("bench_sig_verify", |bencher| { + bench_sig_verify(bencher); + }); +} + +criterion_group!( + name = benches; + config = Criterion::default().sample_size(2); + targets = bench +); +criterion_main!(benches); diff --git a/src/sigverify.rs b/src/sigverify.rs index 107eea5471..85517a7a03 100644 --- a/src/sigverify.rs +++ b/src/sigverify.rs @@ -41,7 +41,6 @@ pub fn init() { // stub } -#[cfg(not(feature = "cuda"))] fn verify_packet(packet: 
&Packet) -> u8 { use ring::signature; use signature::{PublicKey, Signature}; @@ -81,6 +80,11 @@ fn batch_size(batches: &[SharedPackets]) -> usize { #[cfg_attr(feature = "cargo-clippy", allow(ptr_arg))] #[cfg(not(feature = "cuda"))] pub fn ed25519_verify(batches: &Vec<SharedPackets>) -> Vec<Vec<u8>> { + ed25519_verify_cpu(batches) +} + +#[cfg_attr(feature = "cargo-clippy", allow(ptr_arg))] +pub fn ed25519_verify_cpu(batches: &Vec<SharedPackets>) -> Vec<Vec<u8>> { use rayon::prelude::*; let count = batch_size(batches); info!("CPU ECDSA for {}", batch_size(batches)); @@ -134,6 +138,16 @@ pub fn init() { pub fn ed25519_verify(batches: &Vec<SharedPackets>) -> Vec<Vec<u8>> { use packet::PACKET_DATA_SIZE; let count = batch_size(batches); + + // micro-benchmarks show GPU time for smallest batch around 15-20ms + // and CPU speed for 64-128 sig verifies around 10-20ms. 64 is a nice + // power-of-two number around that accounting for the fact that the CPU + // may be busy doing other things while being a real fullnode + // TODO: dynamically adjust this crossover + if count < 64 { + return ed25519_verify_cpu(batches); + } + info!("CUDA ECDSA for {}", batch_size(batches)); let mut out = Vec::new(); let mut elems = Vec::new(); diff --git a/src/transaction.rs b/src/transaction.rs index 5781a4b4d1..718bc57e30 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -207,7 +207,6 @@ impl Transaction { } } -#[cfg(test)] pub fn test_tx() -> Transaction { let keypair1 = KeyPair::new(); let pubkey1 = keypair1.pubkey();