Pull perf into a separate module. (#6718)

automerge
This commit is contained in:
anatoly yakovenko
2019-11-04 20:13:43 -08:00
committed by Grimes
parent 3133ee2401
commit b825d04597
36 changed files with 742 additions and 664 deletions

20
perf/Cargo.toml Normal file
View File

@ -0,0 +1,20 @@
[package]
name = "solana-perf"
version = "0.21.0"
description = "Solana Performance APIs"
authors = ["Solana Maintainers <maintainers@solana.com>"]
repository = "https://github.com/solana-labs/solana"
license = "Apache-2.0"
homepage = "https://solana.com/"
edition = "2018"
[dependencies]
rand = "0.6.5"
dlopen = "0.1.8"
dlopen_derive = "0.1.4"
log = "0.4.8"
solana-sdk = { path = "../sdk", version = "0.21.0" }
[lib]
name = "solana_perf"

295
perf/src/cuda_runtime.rs Normal file
View File

@ -0,0 +1,295 @@
// Module for cuda-related helper functions and wrappers.
//
// cudaHostRegister/cudaHostUnregister -
// apis for page-pinning memory. Cuda driver/hardware cannot overlap
// copies from host memory to GPU memory unless the memory is page-pinned and
// cannot be paged to disk. The cuda driver provides these interfaces to pin and unpin memory.
use crate::perf_libs;
use crate::recycler::Reset;
use std::ops::{Deref, DerefMut};
#[cfg(feature = "pin_gpu_memory")]
use std::os::raw::c_int;
#[cfg(feature = "pin_gpu_memory")]
const CUDA_SUCCESS: c_int = 0;
/// Page-pins the backing buffer of `_mem` via `cudaHostRegister` so the CUDA
/// driver can overlap host->GPU copies (see module header).
///
/// No-op unless built with the `pin_gpu_memory` feature AND the perf libs
/// loaded successfully. Registers `capacity * size_of::<T>()` bytes starting
/// at the buffer base, so a later reallocation invalidates the registration.
/// Failures are logged, not propagated.
pub fn pin<T>(_mem: &mut Vec<T>) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        if let Some(api) = perf_libs::api() {
            unsafe {
                use core::ffi::c_void;
                use std::mem::size_of;
                // Register the whole allocation (capacity, not len) so pushes
                // within the current capacity stay inside the pinned region.
                let err = (api.cuda_host_register)(
                    _mem.as_mut_ptr() as *mut c_void,
                    _mem.capacity() * size_of::<T>(),
                    0,
                );
                if err != CUDA_SUCCESS {
                    error!(
                        "cudaHostRegister error: {} ptr: {:?} bytes: {}",
                        err,
                        _mem.as_ptr(),
                        _mem.capacity() * size_of::<T>()
                    );
                }
            }
        }
    }
}
/// Unregisters a previously pinned buffer via `cudaHostUnregister`.
///
/// `_mem` must be the same base pointer that was registered by `pin`
/// (i.e. the heap buffer, not the address of the owning Vec). No-op unless
/// the `pin_gpu_memory` feature is enabled and the perf libs are loaded.
/// Failures are logged, not propagated.
pub fn unpin<T>(_mem: *mut T) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        if let Some(api) = perf_libs::api() {
            unsafe {
                use core::ffi::c_void;
                let err = (api.cuda_host_unregister)(_mem as *mut c_void);
                if err != CUDA_SUCCESS {
                    error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);
                }
            }
        }
    }
}
// A vector wrapper where the underlying memory can be
// page-pinned. Controlled by flags in case user only wants
// to pin in certain circumstances.
#[derive(Debug)]
pub struct PinnedVec<T> {
    // Backing storage; access from outside goes through Deref/DerefMut.
    x: Vec<T>,
    // True while the current buffer is registered with cudaHostRegister.
    pinned: bool,
    // Opt-in flag: check_ptr/reserve_and_pin only pin when this is set.
    pinnable: bool,
}
impl<T: Default + Clone> Reset for PinnedVec<T> {
    /// Recycler hook: truncate to empty. `resize(0, ..)` keeps the capacity
    /// (and therefore any existing pinning) intact for reuse.
    fn reset(&mut self) {
        self.resize(0, T::default());
    }
}
impl<T: Clone> Default for PinnedVec<T> {
    /// An empty vector that starts out unpinned and non-pinnable.
    fn default() -> Self {
        Self::from_vec(Vec::new())
    }
}
// Transparent read access to the inner Vec's API (len, indexing, slices, ...).
impl<T> Deref for PinnedVec<T> {
    type Target = Vec<T>;
    fn deref(&self) -> &Self::Target {
        &self.x
    }
}
// NOTE(review): DerefMut hands out the raw Vec, so callers can trigger a
// reallocation that bypasses the re-pinning bookkeeping done in push/resize —
// confirm callers avoid capacity-changing operations through this.
impl<T> DerefMut for PinnedVec<T> {
    fn deref_mut(&mut self) -> &mut Vec<T> {
        &mut self.x
    }
}
// Thin newtype wrappers around the std slice iterators so PinnedVec can hand
// out its own iterator types.
pub struct PinnedIter<'a, T>(std::slice::Iter<'a, T>);
pub struct PinnedIterMut<'a, T>(std::slice::IterMut<'a, T>);
impl<'a, T> Iterator for PinnedIter<'a, T> {
    type Item = &'a T;
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}
impl<'a, T> Iterator for PinnedIterMut<'a, T> {
    type Item = &'a mut T;
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}
impl<'a, T> IntoIterator for &'a mut PinnedVec<T> {
type Item = &'a T;
type IntoIter = PinnedIter<'a, T>;
fn into_iter(self) -> Self::IntoIter {
PinnedIter(self.iter())
}
}
// NOTE(review): `self.iter()` here resolves through Deref to the slice's
// `iter` (the inherent `PinnedVec::iter` requires `T: Clone`, which this impl
// does not), so the wrapped iterator is `std::slice::Iter` as `PinnedIter`
// expects.
impl<'a, T> IntoIterator for &'a PinnedVec<T> {
    type Item = &'a T;
    type IntoIter = PinnedIter<'a, T>;
    fn into_iter(self) -> Self::IntoIter {
        PinnedIter(self.iter())
    }
}
impl<T: Clone> PinnedVec<T> {
    /// Ensure capacity for at least `size` elements and page-pin the buffer.
    ///
    /// If a reallocation is needed, the old buffer is unpinned first (when
    /// pinned), then the grown buffer is pinned. Also marks the vector
    /// pinnable as a side effect.
    pub fn reserve_and_pin(&mut self, size: usize) {
        if self.x.capacity() < size {
            if self.pinned {
                // Fix: unpin the heap buffer that `pin()` registered
                // (`self.x.as_mut_ptr()`). The previous `unpin(&mut self.x)`
                // coerced `&mut Vec<T>` to `*mut Vec<T>`, handing the address
                // of the Vec header — not the pinned buffer — to
                // cudaHostUnregister.
                unpin(self.x.as_mut_ptr());
                self.pinned = false;
            }
            self.x.reserve(size);
        }
        self.set_pinnable();
        if !self.pinned {
            pin(&mut self.x);
            self.pinned = true;
        }
    }

    /// Opt this vector into pinning (consulted by `check_ptr`).
    pub fn set_pinnable(&mut self) {
        self.pinnable = true;
    }

    /// Wrap an existing Vec; starts unpinned and non-pinnable.
    pub fn from_vec(source: Vec<T>) -> Self {
        Self {
            x: source,
            pinned: false,
            pinnable: false,
        }
    }

    /// Like `Vec::with_capacity`; starts unpinned and non-pinnable.
    pub fn with_capacity(capacity: usize) -> Self {
        let x = Vec::with_capacity(capacity);
        Self {
            x,
            pinned: false,
            pinnable: false,
        }
    }

    pub fn iter(&self) -> PinnedIter<T> {
        PinnedIter(self.x.iter())
    }

    pub fn iter_mut(&mut self) -> PinnedIterMut<T> {
        PinnedIterMut(self.x.iter_mut())
    }

    pub fn is_empty(&self) -> bool {
        self.x.is_empty()
    }

    pub fn len(&self) -> usize {
        self.x.len()
    }

    pub fn as_ptr(&self) -> *const T {
        self.x.as_ptr()
    }

    pub fn as_mut_ptr(&mut self) -> *mut T {
        self.x.as_mut_ptr()
    }

    /// Append an element, keeping the pinning bookkeeping consistent when
    /// the push forces a reallocation.
    pub fn push(&mut self, x: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin
        if self.pinned && self.x.capacity() == self.x.len() {
            unpin(old_ptr);
            self.pinned = false;
        }
        self.x.push(x);
        self.check_ptr(old_ptr, old_capacity, "push");
    }

    /// Resize, keeping the pinning bookkeeping consistent when growth forces
    /// a reallocation.
    pub fn resize(&mut self, size: usize, elem: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin.
        if self.pinned && self.x.capacity() < size {
            unpin(old_ptr);
            self.pinned = false;
        }
        self.x.resize(size, elem);
        self.check_ptr(old_ptr, old_capacity, "resize");
    }

    /// Re-pin after an operation that may have moved the buffer: when the
    /// perf libs are loaded, this vector is pinnable, and the buffer pointer
    /// or capacity changed, unpin the old buffer (if still marked pinned)
    /// and pin the new one.
    fn check_ptr(&mut self, _old_ptr: *mut T, _old_capacity: usize, _from: &'static str) {
        let api = perf_libs::api();
        if api.is_some()
            && self.pinnable
            && (self.x.as_ptr() != _old_ptr || self.x.capacity() != _old_capacity)
        {
            if self.pinned {
                unpin(_old_ptr);
            }
            trace!(
                "pinning from check_ptr old: {} size: {} from: {}",
                _old_capacity,
                self.x.capacity(),
                _from
            );
            pin(&mut self.x);
            self.pinned = true;
        }
    }
}
impl<T: Clone> Clone for PinnedVec<T> {
    /// Deep-copies the contents; if the source buffer was pinned, the fresh
    /// buffer is pinned as well. Pinnability carries over unchanged.
    fn clone(&self) -> Self {
        let mut copy = self.x.clone();
        if self.pinned {
            pin(&mut copy);
        }
        debug!(
            "clone PinnedVec: size: {} pinned?: {} pinnable?: {}",
            self.x.capacity(),
            self.pinned,
            self.pinnable
        );
        Self {
            x: copy,
            pinned: self.pinned,
            pinnable: self.pinnable,
        }
    }
}
impl<T> Drop for PinnedVec<T> {
    /// Unregister the buffer before the Vec frees it, so we never leave a
    /// dangling cudaHostRegister entry behind.
    fn drop(&mut self) {
        if self.pinned {
            unpin(self.x.as_mut_ptr());
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises push/resize/indexing/iteration on an unpinned PinnedVec
    // (no CUDA libs are loaded in unit tests, so pinning is a no-op).
    #[test]
    fn test_pinned_vec() {
        let mut mem = PinnedVec::with_capacity(10);
        mem.set_pinnable();
        mem.push(50);
        mem.resize(2, 10);
        assert_eq!(mem.len(), 2);
        assert!(!mem.is_empty());
        assert_eq!(mem[0], 50);
        assert_eq!(mem[1], 10);
        let collected: Vec<_> = mem.iter().cloned().collect();
        assert_eq!(collected, vec![50, 10]);
    }
}

6
perf/src/lib.rs Normal file
View File

@ -0,0 +1,6 @@
//! `solana-perf`: performance-sensitive building blocks split out of the main
//! crate — CUDA page-pinning helpers, dynamically loaded perf libraries, and
//! an object recycler.
pub mod cuda_runtime;
pub mod perf_libs;
pub mod recycler;
// Makes log's macros (info!/warn!/trace!/error!/debug!) visible to all
// submodules without per-module `use` statements.
#[macro_use]
extern crate log;

190
perf/src/perf_libs.rs Normal file
View File

@ -0,0 +1,190 @@
use core::ffi::c_void;
use dlopen::symbor::{Container, SymBorApi, Symbol};
use dlopen_derive::SymBorApi;
use log::*;
use solana_sdk::packet::Packet;
use std::env;
use std::ffi::OsStr;
use std::fs;
use std::os::raw::{c_int, c_uint};
use std::path::{Path, PathBuf};
use std::sync::Once;
/// C-layout descriptor of a packet batch handed across the FFI boundary:
/// a raw pointer to `num` contiguous `Packet`s.
#[repr(C)]
pub struct Elems {
    pub elems: *const Packet,
    pub num: u32,
}
/// Function table resolved from the dynamically loaded perf library
/// (`libcuda-crypt.so`). Each field borrows a symbol from the `Container`
/// that loaded the shared object (see `init`); signatures must match the
/// C exports exactly.
#[derive(SymBorApi)]
pub struct Api<'a> {
    pub ed25519_init: Symbol<'a, unsafe extern "C" fn() -> bool>,
    pub ed25519_set_verbose: Symbol<'a, unsafe extern "C" fn(val: bool)>,
    /// Batch signature verification over packet vecs; writes the per-signature
    /// results into `out`.
    #[allow(clippy::type_complexity)]
    pub ed25519_verify_many: Symbol<
        'a,
        unsafe extern "C" fn(
            vecs: *const Elems,
            num: u32,          //number of vecs
            message_size: u32, //size of each element inside the elems field of the vec
            total_packets: u32,
            total_signatures: u32,
            message_lens: *const u32,
            pubkey_offsets: *const u32,
            signature_offsets: *const u32,
            signed_message_offsets: *const u32,
            out: *mut u8, //combined length of all the items in vecs
            use_non_default_stream: u8,
        ) -> u32,
    >,
    /// Batch signing; writes signatures into `signatures_out`.
    /// (Parameter name typo `sgnatures_out` fixed — names in fn-pointer
    /// types are documentation only, so the ABI is unchanged.)
    #[allow(clippy::type_complexity)]
    pub ed25519_sign_many: Symbol<
        'a,
        unsafe extern "C" fn(
            vecs: *mut Elems,
            num: u32,          //number of vecs
            message_size: u32, //size of each element inside the elems field of the vec
            total_packets: u32,
            total_signatures: u32,
            message_lens: *const u32,
            pubkey_offsets: *const u32,
            privkey_offsets: *const u32,
            signed_message_offsets: *const u32,
            signatures_out: *mut u8, //combined length of all the items in vecs
            use_non_default_stream: u8,
        ) -> u32,
    >,
    pub chacha_cbc_encrypt_many_sample: Symbol<
        'a,
        unsafe extern "C" fn(
            input: *const u8,
            sha_state: *mut u8,
            in_len: usize,
            keys: *const u8,
            ivec: *mut u8,
            num_keys: u32,
            samples: *const u64,
            num_samples: u32,
            starting_block: u64,
            time_us: *mut f32,
        ),
    >,
    pub chacha_init_sha_state: Symbol<'a, unsafe extern "C" fn(sha_state: *mut u8, num_keys: u32)>,
    pub chacha_end_sha_state:
        Symbol<'a, unsafe extern "C" fn(sha_state_in: *const u8, out: *mut u8, num_keys: u32)>,
    pub poh_verify_many: Symbol<
        'a,
        unsafe extern "C" fn(
            hashes: *mut u8,
            num_hashes_arr: *const u64,
            num_elems: usize,
            use_non_default_stream: u8,
        ) -> c_int,
    >,
    /// cudaHostRegister / cudaHostUnregister wrappers used by
    /// `cuda_runtime::pin` / `unpin` for page-pinning host memory.
    pub cuda_host_register:
        Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void, size: usize, flags: c_uint) -> c_int>,
    pub cuda_host_unregister: Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void) -> c_int>,
}
// Process-wide handle to the loaded perf library. Written at most once by
// `init` (guarded by its `Once`) and read through `api()`.
static mut API: Option<Container<Api>> = None;
// Loads the shared object `name` into `API`, exiting the process on failure.
// NOTE(review): this logs on every call, even after the library was already
// loaded, and a second call with a *different* name is silently ignored by
// `call_once` — confirm callers only ever pass one library per process.
fn init(name: &OsStr) {
    static INIT_HOOK: Once = Once::new();
    info!("Loading {:?}", name);
    unsafe {
        INIT_HOOK.call_once(|| {
            API = Some(Container::load(name).unwrap_or_else(|err| {
                error!("Unable to load {:?}: {}", name, err);
                std::process::exit(1);
            }));
        })
    }
}
/// Looks for a `perf-libs` directory next to the running executable and
/// returns its path, logging and returning `None` when it is absent.
fn locate_perf_libs() -> Option<PathBuf> {
    let exe = env::current_exe().expect("Unable to get executable path");
    let candidate = exe.parent().unwrap().join("perf-libs");
    if !candidate.is_dir() {
        warn!("{:?} does not exist", candidate);
        return None;
    }
    info!("perf-libs found at {:?}", candidate);
    Some(candidate)
}
/// Matches the `cuda-<version>` subdirectories of `perf_libs_path` against
/// installations under `/usr/local`, returning the first install found.
fn find_cuda_home(perf_libs_path: &Path) -> Option<PathBuf> {
    // Search /usr/local for a `cuda-` directory that matches a perf-libs subdirectory
    for entry in fs::read_dir(&perf_libs_path).unwrap().flatten() {
        let path = entry.path();
        if !path.is_dir() {
            continue;
        }
        let dir_name = path.file_name().unwrap().to_str().unwrap_or("");
        if !dir_name.starts_with("cuda-") {
            continue;
        }
        let cuda_home: PathBuf = ["/", "usr", "local", dir_name].iter().collect();
        if cuda_home.is_dir() {
            return Some(cuda_home);
        }
    }
    None
}
/// Locates the perf-libs directory and a matching CUDA installation,
/// prepends `$CUDA_HOME/lib64` to `LD_LIBRARY_PATH`, and loads the matching
/// `libcuda-crypt.so`. Falls back to loading `libcuda-crypt.so` from the
/// default search path when nothing is found.
pub fn init_cuda() {
    if let Some(perf_libs_path) = locate_perf_libs() {
        if let Some(cuda_home) = find_cuda_home(&perf_libs_path) {
            info!("CUDA installation found at {:?}", cuda_home);
            let cuda_lib64_dir = cuda_home.join("lib64");
            if cuda_lib64_dir.is_dir() {
                let ld_library_path = cuda_lib64_dir.to_str().unwrap_or("").to_string()
                    + ":"
                    + &env::var("LD_LIBRARY_PATH").unwrap_or_else(|_| "".to_string());
                info!("LD_LIBRARY_PATH set to {:?}", ld_library_path);
                // Prefix LD_LIBRARY_PATH with $CUDA_HOME/lib64 directory
                // to ensure the correct CUDA version is used
                // NOTE(review): the dynamic loader typically snapshots
                // LD_LIBRARY_PATH at process start, so setting it here may
                // only affect child processes / dependent-library lookup —
                // confirm this has the intended effect.
                env::set_var("LD_LIBRARY_PATH", ld_library_path)
            } else {
                warn!("{:?} does not exist", cuda_lib64_dir);
            }
            // Load the libcuda-crypt.so build that matches the cuda-<ver>
            // subdirectory we just matched.
            let libcuda_crypt = perf_libs_path
                .join(cuda_home.file_name().unwrap())
                .join("libcuda-crypt.so");
            return init(libcuda_crypt.as_os_str());
        } else {
            warn!("CUDA installation not found");
        }
    }
    // Last resort! Blindly load the shared object and hope it all works out
    init(OsStr::new("libcuda-crypt.so"))
}
/// Returns the loaded perf-libs function table, or `None` if no library has
/// been loaded.
///
/// On the first call this triggers `init_cuda()` when the
/// `TEST_PERF_LIBS_CUDA` env var is set (a test hook); otherwise the library
/// is only present if `init_cuda` was called earlier.
pub fn api() -> Option<&'static Container<Api<'static>>> {
    {
        static INIT_HOOK: Once = Once::new();
        INIT_HOOK.call_once(|| {
            if std::env::var("TEST_PERF_LIBS_CUDA").is_ok() {
                init_cuda();
            }
        })
    }
    // NOTE(review): unsynchronized read of a `static mut`; a concurrent
    // first call to `init` on another thread could race this read — confirm
    // initialization always happens-before use.
    unsafe { API.as_ref() }
}

111
perf/src/recycler.rs Normal file
View File

@ -0,0 +1,111 @@
use rand::{thread_rng, Rng};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
// Counters shared by all clones of a Recycler: objects created, objects
// reused from the free list, and the free list's high-water mark.
#[derive(Debug, Default)]
struct RecyclerStats {
    total: AtomicUsize,
    reuse: AtomicUsize,
    max_gc: AtomicUsize,
}
/// A simple free-list allocator: objects returned via `recycle` are `Reset`
/// and handed back out by `allocate` instead of constructing fresh ones.
/// Clones share the same free list and stats.
#[derive(Debug)]
pub struct Recycler<T> {
    // The free list of recycled objects ("gc"-ed, awaiting reuse).
    gc: Arc<Mutex<Vec<T>>>,
    stats: Arc<RecyclerStats>,
    // Random id used only to tell instances apart in trace logs.
    id: usize,
}
impl<T: Default> Default for Recycler<T> {
fn default() -> Recycler<T> {
let id = thread_rng().gen_range(0, 1000);
trace!("new recycler..{}", id);
Recycler {
gc: Arc::new(Mutex::new(vec![])),
stats: Arc::new(RecyclerStats::default()),
id,
}
}
}
impl<T: Default> Clone for Recycler<T> {
fn clone(&self) -> Recycler<T> {
Recycler {
gc: self.gc.clone(),
stats: self.stats.clone(),
id: self.id,
}
}
}
/// Implemented by types that can be cheaply restored to a clean state before
/// being handed back out by `Recycler::allocate`.
pub trait Reset {
    fn reset(&mut self);
}
impl<T: Default + Reset> Recycler<T> {
    /// Pops a recycled object off the free list (resetting it) or constructs
    /// a fresh `T::default()`, updating the shared stats either way. `name`
    /// is only used for trace logging.
    pub fn allocate(&self, name: &'static str) -> T {
        let recycled = self
            .gc
            .lock()
            .expect("recycler lock in pb fn allocate")
            .pop();
        match recycled {
            Some(mut object) => {
                self.stats.reuse.fetch_add(1, Ordering::Relaxed);
                object.reset();
                object
            }
            None => {
                trace!(
                    "allocating new: total {} {:?} id: {} reuse: {} max_gc: {}",
                    self.stats.total.fetch_add(1, Ordering::Relaxed),
                    name,
                    self.id,
                    self.stats.reuse.load(Ordering::Relaxed),
                    self.stats.max_gc.load(Ordering::Relaxed),
                );
                T::default()
            }
        }
    }

    /// Returns an object to the free list and updates the high-water mark.
    pub fn recycle(&self, x: T) {
        let mut gc = self.gc.lock().expect("recycler lock in pub fn recycle");
        gc.push(x);
        let len = gc.len();
        drop(gc);
        let max_gc = self.stats.max_gc.load(Ordering::Relaxed);
        if len > max_gc {
            // this is not completely accurate, but for most cases should be fine.
            self.stats
                .max_gc
                .compare_and_swap(max_gc, len, Ordering::Relaxed);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Recycled u64s come back as 10 so a reused allocation is observable.
    impl Reset for u64 {
        fn reset(&mut self) {
            *self = 10;
        }
    }

    #[test]
    fn test_recycler() {
        let recycler = Recycler::default();
        // First allocation: free list empty, so we get T::default().
        let mut value: u64 = recycler.allocate("test_recycler1");
        assert_eq!(value, 0);
        value = 20;
        // Clones share the free list: recycling through a clone is visible
        // from the original handle.
        let shared = recycler.clone();
        shared.recycle(value);
        assert_eq!(recycler.gc.lock().unwrap().len(), 1);
        // Second allocation reuses the recycled object after reset().
        let reused = recycler.allocate("test_recycler2");
        assert_eq!(reused, 10);
        assert_eq!(recycler.gc.lock().unwrap().len(), 0);
    }
}