Remove CUDA feature (#6094)

2019-09-26 13:36:51 -07:00
parent a964570b1a
commit b4da83a3ab
33 changed files with 375 additions and 512 deletions
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@ -14,7 +14,6 @@ edition = "2018"
 codecov = { repository = "solana-labs/solana", branch = "master", service = "github" }

 [features]
-cuda = []
 pin_gpu_memory = []

 [dependencies]
@ -27,6 +26,8 @@ core_affinity = "0.5.9"
 crc = { version = "1.8.1", optional = true }
 crossbeam-channel = "0.3"
 dir-diff = "0.3.1"
+dlopen = "0.1.8"
+dlopen_derive = "0.1.4"
 fs_extra = "1.1.0"
 indexmap = "1.1"
 itertools = "0.8.0"
--- a/core/build.rs
+++ b/core/build.rs
@ -1,50 +0,0 @@
-use std::env;
-use std::fs;
-use std::path::Path;
-use std::process::exit;
-
-fn main() {
-    println!("cargo:rerun-if-changed=build.rs");
-
-    if env::var("CARGO_FEATURE_CUDA").is_ok() {
-        if cfg!(not(target_os = "linux")) {
-            eprintln!("Error: CUDA feature is only available on Linux");
-            exit(1);
-        }
-        println!("cargo:rustc-cfg=cuda");
-
-        let perf_libs_dir = {
-            let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
-            let mut path = Path::new(&manifest_dir);
-            path = path.parent().unwrap();
-            let mut path = path.join(Path::new("target/perf-libs"));
-            path.push(env::var("SOLANA_PERF_LIBS_CUDA").unwrap_or_else(|err| {
-                eprintln!("Error: SOLANA_PERF_LIBS_CUDA not defined: {}", err);
-                exit(1);
-            }));
-            path
-        };
-        let perf_libs_dir = perf_libs_dir.to_str().unwrap();
-
-        // Ensure `perf_libs_dir` exists.  It's been observed that
-        // a cargo:rerun-if-changed= directive with a non-existent
-        // directory triggers a rebuild on every |cargo build| invocation
-        fs::create_dir_all(&perf_libs_dir).unwrap_or_else(|err| {
-            if err.kind() != std::io::ErrorKind::AlreadyExists {
-                panic!("Unable to create {}: {:?}", perf_libs_dir, err);
-            }
-        });
-        println!("cargo:rerun-if-changed={}", perf_libs_dir);
-        println!("cargo:rustc-link-search=native={}", perf_libs_dir);
-        if cfg!(windows) {
-            println!("cargo:rerun-if-changed={}/libcuda-crypt.dll", perf_libs_dir);
-        } else if cfg!(target_os = "macos") {
-            println!(
-                "cargo:rerun-if-changed={}/libcuda-crypt.dylib",
-                perf_libs_dir
-            );
-        } else {
-            println!("cargo:rerun-if-changed={}/libcuda-crypt.so", perf_libs_dir);
-        }
-    }
-}
--- a/core/src/chacha_cuda.rs
+++ b/core/src/chacha_cuda.rs
@ -1,11 +1,8 @@
-// Module used by validators to approve storage mining proofs
-// // in parallel using the GPU
+// Module used by validators to approve storage mining proofs in parallel using the GPU

 use crate::blocktree::Blocktree;
 use crate::chacha::{CHACHA_BLOCK_SIZE, CHACHA_KEY_SIZE};
-use crate::sigverify::{
-    chacha_cbc_encrypt_many_sample, chacha_end_sha_state, chacha_init_sha_state,
-};
+use crate::perf_libs;
 use solana_sdk::hash::Hash;
 use std::io;
 use std::mem::size_of;
@ -22,6 +19,7 @@ pub fn chacha_cbc_encrypt_file_many_keys(
    ivecs: &mut [u8],
    samples: &[u64],
 ) -> io::Result<Vec<Hash>> {
+    let api = perf_libs::api().expect("no perf libs");
    if ivecs.len() % CHACHA_BLOCK_SIZE != 0 {
        return Err(io::Error::new(
            io::ErrorKind::Other,
@ -45,7 +43,7 @@ pub fn chacha_cbc_encrypt_file_many_keys(
    let mut total_size = 0;
    let mut time: f32 = 0.0;
    unsafe {
-        chacha_init_sha_state(int_sha_states.as_mut_ptr(), num_keys as u32);
+        (api.chacha_init_sha_state)(int_sha_states.as_mut_ptr(), num_keys as u32);
    }
    loop {
        match blocktree.get_data_shreds(current_slot, start_index, std::u64::MAX, &mut buffer) {
@ -73,7 +71,7 @@ pub fn chacha_cbc_encrypt_file_many_keys(
                }

                unsafe {
-                    chacha_cbc_encrypt_many_sample(
+                    (api.chacha_cbc_encrypt_many_sample)(
                        buffer[..size].as_ptr(),
                        int_sha_states.as_mut_ptr(),
                        size,
@ -97,7 +95,7 @@ pub fn chacha_cbc_encrypt_file_many_keys(
        }
    }
    unsafe {
-        chacha_end_sha_state(
+        (api.chacha_end_sha_state)(
            int_sha_states.as_ptr(),
            sha_states.as_mut_ptr(),
            num_keys as u32,
@ -114,22 +112,23 @@ pub fn chacha_cbc_encrypt_file_many_keys(

 #[cfg(test)]
 mod tests {
+    use super::*;
    use crate::blocktree::get_tmp_ledger_path;
-    use crate::blocktree::Blocktree;
    use crate::chacha::chacha_cbc_encrypt_ledger;
-    use crate::chacha_cuda::chacha_cbc_encrypt_file_many_keys;
    use crate::entry::create_ticks;
    use crate::replicator::sample_file;
    use solana_sdk::clock::DEFAULT_SLOTS_PER_SEGMENT;
-    use solana_sdk::hash::Hash;
    use solana_sdk::signature::{Keypair, KeypairUtil};
    use std::fs::{remove_dir_all, remove_file};
    use std::path::Path;
-    use std::sync::Arc;

    #[test]
    fn test_encrypt_file_many_keys_single() {
        solana_logger::setup();
+        if perf_libs::api().is_none() {
+            info!("perf-libs unavailable, skipped");
+            return;
+        }

        let slots_per_segment = 32;
        let entries = create_ticks(slots_per_segment, Hash::default());
@ -189,6 +188,10 @@ mod tests {
    #[test]
    fn test_encrypt_file_many_keys_multiple_keys() {
        solana_logger::setup();
+        if perf_libs::api().is_none() {
+            info!("perf-libs unavailable, skipped");
+            return;
+        }

        let entries = create_ticks(32, Hash::default());
        let ledger_dir = "test_encrypt_file_many_keys_multiple";
@ -255,6 +258,12 @@ mod tests {

    #[test]
    fn test_encrypt_file_many_keys_bad_key_length() {
+        solana_logger::setup();
+        if perf_libs::api().is_none() {
+            info!("perf-libs unavailable, skipped");
+            return;
+        }
+
        let mut keys = hex!("abc123");
        let ledger_dir = "test_encrypt_file_many_keys_bad_key_length";
        let ledger_path = get_tmp_ledger_path(ledger_dir);
--- a/core/src/cuda_runtime.rs
+++ b/core/src/cuda_runtime.rs
@ -5,48 +5,55 @@
 //    copies from host memory to GPU memory unless the memory is page-pinned and
 //    cannot be paged to disk. The cuda driver provides these interfaces to pin and unpin memory.

+#[cfg(feature = "pin_gpu_memory")]
+use crate::perf_libs;
 use crate::recycler::Reset;
-
-#[cfg(all(feature = "cuda", feature = "pin_gpu_memory"))]
-use crate::sigverify::{cuda_host_register, cuda_host_unregister};
 use std::ops::{Deref, DerefMut};

-#[cfg(all(feature = "cuda", feature = "pin_gpu_memory"))]
+#[cfg(feature = "pin_gpu_memory")]
 use std::os::raw::c_int;

-#[cfg(all(feature = "cuda", feature = "pin_gpu_memory"))]
+#[cfg(feature = "pin_gpu_memory")]
 const CUDA_SUCCESS: c_int = 0;

 pub fn pin<T>(_mem: &mut Vec<T>) {
-    #[cfg(all(feature = "cuda", feature = "pin_gpu_memory"))]
-    unsafe {
-        use core::ffi::c_void;
-        use std::mem::size_of;
+    #[cfg(feature = "pin_gpu_memory")]
+    {
+        if let Some(api) = perf_libs::api() {
+            unsafe {
+                use core::ffi::c_void;
+                use std::mem::size_of;

-        let err = cuda_host_register(
-            _mem.as_mut_ptr() as *mut c_void,
-            _mem.capacity() * size_of::<T>(),
-            0,
-        );
-        if err != CUDA_SUCCESS {
-            error!(
-                "cudaHostRegister error: {} ptr: {:?} bytes: {}",
-                err,
-                _mem.as_ptr(),
-                _mem.capacity() * size_of::<T>()
-            );
+                let err = (api.cuda_host_register)(
+                    _mem.as_mut_ptr() as *mut c_void,
+                    _mem.capacity() * size_of::<T>(),
+                    0,
+                );
+                if err != CUDA_SUCCESS {
+                    error!(
+                        "cudaHostRegister error: {} ptr: {:?} bytes: {}",
+                        err,
+                        _mem.as_ptr(),
+                        _mem.capacity() * size_of::<T>()
+                    );
+                }
+            }
        }
    }
 }

 pub fn unpin<T>(_mem: *mut T) {
-    #[cfg(all(feature = "cuda", feature = "pin_gpu_memory"))]
-    unsafe {
-        use core::ffi::c_void;
+    #[cfg(feature = "pin_gpu_memory")]
+    {
+        if let Some(api) = perf_libs::api() {
+            unsafe {
+                use core::ffi::c_void;

-        let err = cuda_host_unregister(_mem as *mut c_void);
-        if err != CUDA_SUCCESS {
-            error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);
+                let err = (api.cuda_host_unregister)(_mem as *mut c_void);
+                if err != CUDA_SUCCESS {
+                    error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);
+                }
+            }
        }
    }
 }
--- a/core/src/entry.rs
+++ b/core/src/entry.rs
@ -3,6 +3,7 @@
 //! transactions within it. Entries cannot be reordered, and its field `num_hashes`
 //! represents an approximate amount of time since the last Entry was created.
 use crate::packet::{Blob, SharedBlob};
+use crate::perf_libs;
 use crate::poh::Poh;
 use crate::result::Result;
 use bincode::{deserialize, serialized_size};
@ -10,20 +11,14 @@ use rayon::prelude::*;
 use rayon::ThreadPool;
 use solana_merkle_tree::MerkleTree;
 use solana_metrics::inc_new_counter_warn;
+use solana_rayon_threadlimit::get_thread_count;
 use solana_sdk::hash::Hash;
 use solana_sdk::timing;
 use solana_sdk::transaction::Transaction;
 use std::borrow::Borrow;
 use std::cell::RefCell;
 use std::sync::mpsc::{Receiver, Sender};
-use std::sync::{Arc, RwLock};
-
-#[cfg(feature = "cuda")]
-use crate::sigverify::poh_verify_many;
-use solana_rayon_threadlimit::get_thread_count;
-#[cfg(feature = "cuda")]
-use std::sync::Mutex;
-#[cfg(feature = "cuda")]
+use std::sync::{Arc, Mutex, RwLock};
 use std::thread;
 use std::time::Instant;

@ -257,13 +252,12 @@ impl EntrySlice for [Entry] {
        res
    }

-    #[cfg(not(feature = "cuda"))]
-    fn verify(&self, start_hash: &Hash) -> bool {
-        self.verify_cpu(start_hash)
-    }
-
-    #[cfg(feature = "cuda")]
    fn verify(&self, start_hash: &Hash) -> bool {
+        let api = perf_libs::api();
+        if api.is_none() {
+            return self.verify_cpu(start_hash);
+        }
+        let api = api.unwrap();
        inc_new_counter_warn!("entry_verify-num_entries", self.len() as usize);

        // Use CPU verify if the batch length is < 1K
@ -287,7 +281,7 @@ impl EntrySlice for [Entry] {
            .collect();

        let num_hashes_vec: Vec<u64> = self
-            .into_iter()
+            .iter()
            .map(|entry| entry.num_hashes.saturating_sub(1))
            .collect();

@ -300,7 +294,7 @@ impl EntrySlice for [Entry] {
            let mut hashes = hashes_clone.lock().unwrap();
            let res;
            unsafe {
-                res = poh_verify_many(
+                res = (api.poh_verify_many)(
                    hashes.as_mut_ptr() as *mut u8,
                    num_hashes_vec.as_ptr(),
                    length,
--- a/core/src/lib.rs
+++ b/core/src/lib.rs
@ -10,10 +10,10 @@ pub mod banking_stage;
 pub mod blob_fetch_stage;
 pub mod broadcast_stage;
 pub mod chacha;
-#[cfg(cuda)]
 pub mod chacha_cuda;
 pub mod cluster_info_vote_listener;
 pub mod confidence;
+pub mod perf_libs;
 pub mod recycler;
 #[macro_use]
 pub mod contact_info;
@ -75,6 +75,9 @@ pub(crate) mod version;
 pub mod weighted_shuffle;
 pub mod window_service;

+#[macro_use]
+extern crate dlopen_derive;
+
 #[macro_use]
 extern crate solana_budget_program;

--- a/core/src/perf_libs.rs
+++ b/core/src/perf_libs.rs
@ -0,0 +1,171 @@
+use crate::packet::Packet;
+use core::ffi::c_void;
+use dlopen::symbor::{Container, SymBorApi, Symbol};
+use std::env;
+use std::ffi::OsStr;
+use std::fs;
+use std::os::raw::{c_int, c_uint};
+use std::path::{Path, PathBuf};
+use std::sync::Once;
+
+/// One batch of packets handed to the perf-libs signature verifier.
+///
+/// `#[repr(C)]` because a pointer to an array of these is passed across the FFI
+/// boundary as the `vecs` argument of `ed25519_verify_many`.
+#[repr(C)]
+pub struct Elems {
+    /// Pointer to the first `Packet` of the batch.
+    pub elems: *const Packet,
+    /// Number of packets in the batch that `elems` points to.
+    pub num: u32,
+}
+
+/// Function table of the perf-libs shared object (`libcuda-crypt.so`).
+///
+/// The `SymBorApi` derive resolves each field from the loaded library when the
+/// `Container<Api>` is created, so the field names must match the exported symbol
+/// names. Every entry is an `unsafe extern "C"` function: callers are responsible
+/// for passing valid pointers and lengths.
+#[derive(SymBorApi)]
+pub struct Api<'a> {
+    /// Initializes the ed25519 verifier; `sigverify::init` treats a `false` return as fatal.
+    pub ed25519_init: Symbol<'a, unsafe extern "C" fn() -> bool>,
+    /// Toggles verbose output of the ed25519 library.
+    pub ed25519_set_verbose: Symbol<'a, unsafe extern "C" fn(val: bool)>,
+
+    /// Batch signature verification entry point used by `sigverify::ed25519_verify`.
+    #[allow(clippy::type_complexity)]
+    pub ed25519_verify_many: Symbol<
+        'a,
+        unsafe extern "C" fn(
+            vecs: *const Elems,
+            num: u32,          //number of vecs
+            message_size: u32, //size of each element inside the elems field of the vec
+            total_packets: u32,
+            total_signatures: u32,
+            message_lens: *const u32,
+            pubkey_offsets: *const u32,
+            signature_offsets: *const u32,
+            signed_message_offsets: *const u32,
+            out: *mut u8, //combined length of all the items in vecs
+            use_non_default_stream: u8,
+        ) -> u32,
+    >,
+
+    /// Encrypts/samples a buffer under many keys; called in a loop by
+    /// `chacha_cuda::chacha_cbc_encrypt_file_many_keys`.
+    pub chacha_cbc_encrypt_many_sample: Symbol<
+        'a,
+        unsafe extern "C" fn(
+            input: *const u8,
+            sha_state: *mut u8,
+            in_len: usize,
+            keys: *const u8,
+            ivec: *mut u8,
+            num_keys: u32,
+            samples: *const u64,
+            num_samples: u32,
+            starting_block: u64,
+            time_us: *mut f32,
+        ),
+    >,
+
+    /// Called once before the encryption loop to set up `num_keys` SHA states.
+    pub chacha_init_sha_state: Symbol<'a, unsafe extern "C" fn(sha_state: *mut u8, num_keys: u32)>,
+    /// Called once after the encryption loop to write the final SHA states into `out`.
+    pub chacha_end_sha_state:
+        Symbol<'a, unsafe extern "C" fn(sha_state_in: *const u8, out: *mut u8, num_keys: u32)>,
+
+    /// Batch PoH verification entry point used by `EntrySlice::verify`.
+    pub poh_verify_many: Symbol<
+        'a,
+        unsafe extern "C" fn(
+            hashes: *mut u8,
+            num_hashes_arr: *const u64,
+            num_elems: usize,
+            use_non_default_stream: u8,
+        ) -> c_int,
+    >,
+
+    /// Pins host memory; `cuda_runtime::pin` compares the result against `CUDA_SUCCESS` (0).
+    pub cuda_host_register:
+        Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void, size: usize, flags: c_uint) -> c_int>,
+
+    /// Unpins host memory; `cuda_runtime::unpin` compares the result against `CUDA_SUCCESS` (0).
+    pub cuda_host_unregister: Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void) -> c_int>,
+}
+
+// Process-wide handle to the loaded perf-libs API. It is `None` until `init` has
+// loaded the library; in this file it is written only inside the `Once` in `init`
+// and read through `api()`.
+static mut API: Option<Container<Api>> = None;
+
+/// Loads the shared object `name` and stores its symbol table in `API`.
+///
+/// The load itself runs at most once per process (guarded by a `Once`); the
+/// "Loading" message is logged on every call. If the library cannot be loaded,
+/// the error is logged and the process exits with status 1.
+fn init(name: &OsStr) {
+    static INIT_HOOK: Once = Once::new();
+
+    info!("Loading {:?}", name);
+    // The write to the `static mut` happens inside `call_once`, so it is performed
+    // by at most one caller.
+    unsafe {
+        INIT_HOOK.call_once(|| {
+            API = Some(Container::load(name).unwrap_or_else(|err| {
+                error!("Unable to load {:?}: {}", name, err);
+                std::process::exit(1);
+            }));
+        })
+    }
+}
+
+/// Returns the `perf-libs` directory that sits next to the running executable,
+/// or `None` (after logging a warning) when no such directory exists.
+///
+/// # Panics
+///
+/// Panics if the executable path cannot be determined or has no parent directory.
+fn locate_perf_libs() -> Option<PathBuf> {
+    let candidate = env::current_exe()
+        .expect("Unable to get executable path")
+        .parent()
+        .unwrap()
+        .join("perf-libs");
+
+    if !candidate.is_dir() {
+        warn!("{:?} does not exist", candidate);
+        return None;
+    }
+
+    info!("perf-libs found at {:?}", candidate);
+    Some(candidate)
+}
+
+/// Finds an installed CUDA toolkit that matches one of the bundled perf-libs builds.
+///
+/// Walks the sub-directories of `perf_libs_path` looking for names that start with
+/// `cuda-` and returns `/usr/local/<that name>` for the first one that exists as a
+/// directory. Entries are visited in the order the OS reports them.
+///
+/// Returns `None` when no match is found, or when `perf_libs_path` cannot be read
+/// (a warning is logged) so that the caller can fall back instead of panicking.
+fn find_cuda_home(perf_libs_path: &Path) -> Option<PathBuf> {
+    let entries = match fs::read_dir(perf_libs_path) {
+        Ok(entries) => entries,
+        Err(err) => {
+            warn!("Unable to read {:?}: {}", perf_libs_path, err);
+            return None;
+        }
+    };
+
+    // Entries that fail to read are skipped rather than treated as fatal.
+    for entry in entries.flatten() {
+        let path = entry.path();
+        if !path.is_dir() {
+            continue;
+        }
+
+        // Non-UTF-8 names cannot match the `cuda-` prefix, so they are skipped too.
+        let dir_name = match path.file_name().and_then(|name| name.to_str()) {
+            Some(name) if name.starts_with("cuda-") => name,
+            _ => continue,
+        };
+
+        let cuda_home = Path::new("/usr/local").join(dir_name);
+        if cuda_home.is_dir() {
+            return Some(cuda_home);
+        }
+    }
+    None
+}
+
+/// Locates perf-libs and a matching CUDA installation, then loads `libcuda-crypt.so`.
+///
+/// When both the `perf-libs` directory and a matching `/usr/local/cuda-*` install are
+/// found, `$CUDA_HOME/lib64` is prepended to `LD_LIBRARY_PATH` and the library is
+/// loaded from `perf-libs/<cuda dir>/libcuda-crypt.so`. Otherwise the bare name
+/// `libcuda-crypt.so` is loaded and resolution is left to the dynamic loader.
+///
+/// A failed load terminates the process (see `init`).
+pub fn init_cuda() {
+    if let Some(perf_libs_path) = locate_perf_libs() {
+        if let Some(cuda_home) = find_cuda_home(&perf_libs_path) {
+            info!("CUDA installation found at {:?}", cuda_home);
+
+            let cuda_lib64_dir = cuda_home.join("lib64");
+            if cuda_lib64_dir.is_dir() {
+                // Prefix LD_LIBRARY_PATH with $CUDA_HOME/lib64 directory
+                // to ensure the correct CUDA version is used.
+                //
+                // The list is assembled with `join_paths` instead of string
+                // concatenation so that an unset/empty variable does not leave a
+                // trailing ':' (an empty entry means "current directory") and a
+                // non-UTF-8 existing value is preserved rather than dropped.
+                //
+                // NOTE(review): glibc documents LD_LIBRARY_PATH as being read at
+                // program start, so this presumably only affects child processes,
+                // not the `dlopen` performed below — confirm.
+                let existing = env::var_os("LD_LIBRARY_PATH").unwrap_or_default();
+                let mut search_dirs = vec![cuda_lib64_dir];
+                if !existing.is_empty() {
+                    search_dirs.extend(env::split_paths(&existing));
+                }
+                match env::join_paths(search_dirs) {
+                    Ok(ld_library_path) => {
+                        env::set_var("LD_LIBRARY_PATH", &ld_library_path);
+                        info!("LD_LIBRARY_PATH set to {:?}", ld_library_path);
+                    }
+                    Err(err) => warn!("Unable to update LD_LIBRARY_PATH: {}", err),
+                }
+            } else {
+                warn!("{:?} does not exist", cuda_lib64_dir);
+            }
+
+            // `cuda_home` is `/usr/local/<dir>`, so it always has a final component.
+            let libcuda_crypt = perf_libs_path
+                .join(cuda_home.file_name().unwrap())
+                .join("libcuda-crypt.so");
+            return init(libcuda_crypt.as_os_str());
+        } else {
+            warn!("CUDA installation not found");
+        }
+    }
+
+    // Last resort!  Blindly load the shared object and hope it all works out
+    init(OsStr::new("libcuda-crypt.so"))
+}
+
+/// Returns the loaded perf-libs API, or `None` if no library has been loaded.
+///
+/// In test builds, the first call runs `init_cuda()` when the `TEST_PERF_LIBS_CUDA`
+/// environment variable is set; outside of tests this function never triggers a load.
+pub fn api() -> Option<&'static Container<Api<'static>>> {
+    #[cfg(test)]
+    {
+        // Opt-in, once-per-process load so GPU-backed tests can run when requested.
+        static INIT_HOOK: Once = Once::new();
+        INIT_HOOK.call_once(|| {
+            if std::env::var("TEST_PERF_LIBS_CUDA").is_ok() {
+                init_cuda();
+            }
+        })
+    }
+
+    // NOTE(review): reads the `static mut` without synchronization; presumably relies
+    // on `init` having completed before other threads call this — confirm.
+    unsafe { API.as_ref() }
+}
--- a/core/src/sigverify.rs
+++ b/core/src/sigverify.rs
@ -1,11 +1,12 @@
 //! The `sigverify` module provides digital signature verification functions.
 //! By default, signatures are verified in parallel using all available CPU
-//! cores.  When `--features=cuda` is enabled, signature verification is
-//! offloaded to the GPU.
+//! cores.  When perf-libs are available, signature verification is offloaded
+//! to the GPU.
 //!

 use crate::cuda_runtime::PinnedVec;
 use crate::packet::{Packet, Packets};
+use crate::perf_libs;
 use crate::recycler::Recycler;
 use crate::result::Result;
 use bincode::serialized_size;
@ -19,11 +20,7 @@ use solana_sdk::signature::Signature;
 use solana_sdk::transaction::Transaction;
 use std::mem::size_of;

-#[cfg(feature = "cuda")]
-use core::ffi::c_void;
 use solana_rayon_threadlimit::get_thread_count;
-#[cfg(feature = "cuda")]
-use std::os::raw::{c_int, c_uint};
 pub const NUM_THREADS: u32 = 10;
 use std::cell::RefCell;

@ -36,62 +33,16 @@ pub type TxOffset = PinnedVec<u32>;

 type TxOffsets = (TxOffset, TxOffset, TxOffset, TxOffset, Vec<Vec<u32>>);

-#[cfg(feature = "cuda")]
-#[repr(C)]
-struct Elems {
-    elems: *const Packet,
-    num: u32,
-}
-
-#[cfg(feature = "cuda")]
-#[link(name = "cuda-crypt")]
-extern "C" {
-    fn ed25519_init() -> bool;
-    fn ed25519_set_verbose(val: bool);
-    fn ed25519_verify_many(
-        vecs: *const Elems,
-        num: u32,          //number of vecs
-        message_size: u32, //size of each element inside the elems field of the vec
-        total_packets: u32,
-        total_signatures: u32,
-        message_lens: *const u32,
-        pubkey_offsets: *const u32,
-        signature_offsets: *const u32,
-        signed_message_offsets: *const u32,
-        out: *mut u8, //combined length of all the items in vecs
-        use_non_default_stream: u8,
-    ) -> u32;
-
-    pub fn chacha_cbc_encrypt_many_sample(
-        input: *const u8,
-        sha_state: *mut u8,
-        in_len: usize,
-        keys: *const u8,
-        ivec: *mut u8,
-        num_keys: u32,
-        samples: *const u64,
-        num_samples: u32,
-        starting_block: u64,
-        time_us: *mut f32,
-    );
-
-    pub fn chacha_init_sha_state(sha_state: *mut u8, num_keys: u32);
-    pub fn chacha_end_sha_state(sha_state_in: *const u8, out: *mut u8, num_keys: u32);
-
-    pub fn poh_verify_many(
-        hashes: *mut u8,
-        num_hashes_arr: *const u64,
-        num_elems: usize,
-        use_non_default_stream: u8,
-    ) -> c_int;
-
-    pub fn cuda_host_register(ptr: *mut c_void, size: usize, flags: c_uint) -> c_int;
-    pub fn cuda_host_unregister(ptr: *mut c_void) -> c_int;
-}
-
-#[cfg(not(feature = "cuda"))]
 pub fn init() {
-    // stub
+    if let Some(api) = perf_libs::api() {
+        unsafe {
+            (api.ed25519_set_verbose)(true);
+            if !(api.ed25519_init)() {
+                panic!("ed25519_init() failed");
+            }
+            (api.ed25519_set_verbose)(false);
+        }
+    }
 }

 fn verify_packet(packet: &Packet) -> u8 {
@ -130,15 +81,6 @@ fn batch_size(batches: &[Packets]) -> usize {
    batches.iter().map(|p| p.packets.len()).sum()
 }

-#[cfg(not(feature = "cuda"))]
-pub fn ed25519_verify(
-    batches: &[Packets],
-    _recycler: &Recycler<TxOffset>,
-    _recycler_out: &Recycler<PinnedVec<u8>>,
-) -> Vec<Vec<u8>> {
-    ed25519_verify_cpu(batches)
-}
-
 pub fn get_packet_offsets(packet: &Packet, current_offset: u32) -> (u32, u32, u32, u32) {
    let (sig_len, sig_size) = decode_len(&packet.data);
    let msg_start_offset = sig_size + sig_len * size_of::<Signature>();
@ -235,23 +177,17 @@ pub fn ed25519_verify_disabled(batches: &[Packets]) -> Vec<Vec<u8>> {
    rv
 }

-#[cfg(feature = "cuda")]
-pub fn init() {
-    unsafe {
-        ed25519_set_verbose(true);
-        if !ed25519_init() {
-            panic!("ed25519_init() failed");
-        }
-        ed25519_set_verbose(false);
-    }
-}
-
-#[cfg(feature = "cuda")]
 pub fn ed25519_verify(
    batches: &[Packets],
    recycler: &Recycler<TxOffset>,
    recycler_out: &Recycler<PinnedVec<u8>>,
 ) -> Vec<Vec<u8>> {
+    let api = perf_libs::api();
+    if api.is_none() {
+        return ed25519_verify_cpu(batches);
+    }
+    let api = api.unwrap();
+
    use crate::packet::PACKET_DATA_SIZE;
    let count = batch_size(batches);

@ -276,7 +212,7 @@ pub fn ed25519_verify(

    let mut num_packets = 0;
    for p in batches {
-        elems.push(Elems {
+        elems.push(perf_libs::Elems {
            elems: p.packets.as_ptr(),
            num: p.packets.len() as u32,
        });
@ -292,7 +228,7 @@ pub fn ed25519_verify(
    trace!("len offset: {}", PACKET_DATA_SIZE as u32);
    const USE_NON_DEFAULT_STREAM: u8 = 1;
    unsafe {
-        let res = ed25519_verify_many(
+        let res = (api.ed25519_verify_many)(
            elems.as_ptr(),
            elems.len() as u32,
            size_of::<Packet>() as u32,
--- a/core/src/sigverify_stage.rs
+++ b/core/src/sigverify_stage.rs
@ -3,10 +3,11 @@
 //! top-level list with a list of booleans, telling the next stage whether the
 //! signature in that packet is valid. It assumes each packet contains one
 //! transaction. All processing is done on the CPU by default and on a GPU
-//! if the `cuda` feature is enabled with `--features=cuda`.
+//! if perf-libs are available.

 use crate::cuda_runtime::PinnedVec;
 use crate::packet::Packets;
+use crate::perf_libs;
 use crate::recycler::Recycler;
 use crate::result::{Error, Result};
 use crate::service::Service;
@ -21,11 +22,8 @@ use std::sync::mpsc::{Receiver, RecvTimeoutError};
 use std::sync::{Arc, Mutex};
 use std::thread::{self, Builder, JoinHandle};

-#[cfg(feature = "cuda")]
-const RECV_BATCH_MAX: usize = 5_000;
-
-#[cfg(not(feature = "cuda"))]
-const RECV_BATCH_MAX: usize = 1000;
+const RECV_BATCH_MAX_CPU: usize = 1_000;
+const RECV_BATCH_MAX_GPU: usize = 5_000;

 pub type VerifiedPackets = Vec<(Packets, Vec<u8>)>;

@ -70,7 +68,11 @@ impl SigVerifyStage {
    ) -> Result<()> {
        let (batch, len, recv_time) = streamer::recv_batch(
            &recvr.lock().expect("'recvr' lock in fn verifier"),
-            RECV_BATCH_MAX,
+            if perf_libs::api().is_some() {
+                RECV_BATCH_MAX_GPU
+            } else {
+                RECV_BATCH_MAX_CPU
+            },
        )?;
        inc_new_counter_info!("sigverify_stage-packets_received", len);

--- a/core/src/storage_stage.rs
+++ b/core/src/storage_stage.rs
@ -4,7 +4,6 @@

 use crate::bank_forks::BankForks;
 use crate::blocktree::Blocktree;
-#[cfg(cuda)]
 use crate::chacha_cuda::chacha_cbc_encrypt_file_many_keys;
 use crate::cluster_info::ClusterInfo;
 use crate::result::{Error, Result};
@ -408,11 +407,11 @@ impl StorageStage {
            samples.push(rng.gen_range(0, 10));
        }
        debug!("generated samples: {:?}", samples);
+
        // TODO: cuda required to generate the reference values
        // but if it is missing, then we need to take care not to
        // process storage mining results.
-        #[cfg(cuda)]
-        {
+        if crate::perf_libs::api().is_some() {
            // Lock the keys, since this is the IV memory,
            // it will be updated in-place by the encryption.
            // Should be overwritten by the proof signatures which replace the
@ -729,10 +728,8 @@ mod tests {
        let keypair = Keypair::new();
        let hash = Hash::default();
        let signature = keypair.sign_message(&hash.as_ref());
-        #[cfg(feature = "cuda")]
+
        let mut result = storage_state.get_mining_result(&signature);
-        #[cfg(not(feature = "cuda"))]
-        let result = storage_state.get_mining_result(&signature);

        assert_eq!(result, Hash::default());

@ -752,26 +749,27 @@ mod tests {
            .collect::<Vec<_>>();
        bank_sender.send(rooted_banks).unwrap();

-        #[cfg(feature = "cuda")]
-        for _ in 0..5 {
-            result = storage_state.get_mining_result(&signature);
-            if result != Hash::default() {
-                info!("found result = {:?} sleeping..", result);
-                break;
+        if crate::perf_libs::api().is_some() {
+            for _ in 0..5 {
+                result = storage_state.get_mining_result(&signature);
+                if result != Hash::default() {
+                    info!("found result = {:?} sleeping..", result);
+                    break;
+                }
+                info!("result = {:?} sleeping..", result);
+                sleep(Duration::new(1, 0));
            }
-            info!("result = {:?} sleeping..", result);
-            sleep(Duration::new(1, 0));
        }

        info!("joining..?");
        exit.store(true, Ordering::Relaxed);
        storage_stage.join().unwrap();

-        #[cfg(not(cuda))]
-        assert_eq!(result, Hash::default());
-
-        #[cfg(cuda)]
-        assert_ne!(result, Hash::default());
+        if crate::perf_libs::api().is_some() {
+            assert_ne!(result, Hash::default());
+        } else {
+            assert_eq!(result, Hash::default());
+        }

        remove_dir_all(ledger_path).unwrap();
    }
--- a/core/src/validator.rs
+++ b/core/src/validator.rs
@ -118,7 +118,14 @@ impl Validator {

        warn!("identity pubkey: {:?}", id);
        warn!("vote pubkey: {:?}", vote_account);
-        warn!("CUDA is {}abled", if cfg!(cuda) { "en" } else { "dis" });
+        warn!(
+            "CUDA is {}abled",
+            if crate::perf_libs::api().is_some() {
+                "en"
+            } else {
+                "dis"
+            }
+        );
        info!("entrypoint: {:?}", entrypoint_info_option);

        Self::print_node_info(&node);