Pull perf into a separate module. (#6718)

automerge
This commit is contained in:
anatoly yakovenko
2019-11-04 20:13:43 -08:00
committed by Grimes
parent 3133ee2401
commit b825d04597
36 changed files with 742 additions and 664 deletions

20
perf/Cargo.toml Normal file
View File

@ -0,0 +1,20 @@
[package]
name = "solana-perf"
version = "0.21.0"
description = "Solana Performance APIs"
authors = ["Solana Maintainers <maintainers@solana.com>"]
repository = "https://github.com/solana-labs/solana"
license = "Apache-2.0"
homepage = "https://solana.com/"
edition = "2018"
[dependencies]
rand = "0.6.5"
dlopen = "0.1.8"
dlopen_derive = "0.1.4"
log = "0.4.8"
solana-sdk = { path = "../sdk", version = "0.21.0" }
[lib]
name = "solana_perf"

295
perf/src/cuda_runtime.rs Normal file
View File

@ -0,0 +1,295 @@
// Module for cuda-related helper functions and wrappers.
//
// cudaHostRegister/cudaHostUnregister -
// apis for page-pinning memory. Cuda driver/hardware cannot overlap
// copies from host memory to GPU memory unless the memory is page-pinned and
// cannot be paged to disk. The cuda driver provides these interfaces to pin and unpin memory.
use crate::perf_libs;
use crate::recycler::Reset;
use std::ops::{Deref, DerefMut};
#[cfg(feature = "pin_gpu_memory")]
use std::os::raw::c_int;
#[cfg(feature = "pin_gpu_memory")]
const CUDA_SUCCESS: c_int = 0;
/// Page-pins the backing buffer of `_mem` via `cudaHostRegister` so the CUDA
/// driver can overlap host->GPU copies (see module header).
///
/// No-op unless built with the `pin_gpu_memory` feature AND the perf libs
/// loaded successfully. Registers `capacity * size_of::<T>()` bytes starting
/// at the buffer base, so a later reallocation invalidates the registration.
/// Failures are logged, not propagated.
pub fn pin<T>(_mem: &mut Vec<T>) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        if let Some(api) = perf_libs::api() {
            unsafe {
                use core::ffi::c_void;
                use std::mem::size_of;
                // Register the whole allocation (capacity, not len) so pushes
                // within the current capacity stay inside the pinned region.
                let err = (api.cuda_host_register)(
                    _mem.as_mut_ptr() as *mut c_void,
                    _mem.capacity() * size_of::<T>(),
                    0,
                );
                if err != CUDA_SUCCESS {
                    error!(
                        "cudaHostRegister error: {} ptr: {:?} bytes: {}",
                        err,
                        _mem.as_ptr(),
                        _mem.capacity() * size_of::<T>()
                    );
                }
            }
        }
    }
}
/// Unregisters a previously pinned buffer via `cudaHostUnregister`.
///
/// `_mem` must be the same base pointer that was registered by `pin`
/// (i.e. the heap buffer, not the address of the owning Vec). No-op unless
/// the `pin_gpu_memory` feature is enabled and the perf libs are loaded.
/// Failures are logged, not propagated.
pub fn unpin<T>(_mem: *mut T) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        if let Some(api) = perf_libs::api() {
            unsafe {
                use core::ffi::c_void;
                let err = (api.cuda_host_unregister)(_mem as *mut c_void);
                if err != CUDA_SUCCESS {
                    error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);
                }
            }
        }
    }
}
// A vector wrapper where the underlying memory can be
// page-pinned. Controlled by flags in case user only wants
// to pin in certain circumstances.
#[derive(Debug)]
pub struct PinnedVec<T> {
    // Backing storage; access from outside goes through Deref/DerefMut.
    x: Vec<T>,
    // True while the current buffer is registered with cudaHostRegister.
    pinned: bool,
    // Opt-in flag: check_ptr/reserve_and_pin only pin when this is set.
    pinnable: bool,
}
impl<T: Default + Clone> Reset for PinnedVec<T> {
    /// Recycler hook: truncate to empty. `resize(0, ..)` keeps the capacity
    /// (and therefore any existing pinning) intact for reuse.
    fn reset(&mut self) {
        self.resize(0, T::default());
    }
}
impl<T: Clone> Default for PinnedVec<T> {
    /// An empty vector that starts out unpinned and non-pinnable.
    fn default() -> Self {
        Self::from_vec(Vec::new())
    }
}
// Transparent read access to the inner Vec's API (len, indexing, slices, ...).
impl<T> Deref for PinnedVec<T> {
    type Target = Vec<T>;
    fn deref(&self) -> &Self::Target {
        &self.x
    }
}
// NOTE(review): DerefMut hands out the raw Vec, so callers can trigger a
// reallocation that bypasses the re-pinning bookkeeping done in push/resize —
// confirm callers avoid capacity-changing operations through this.
impl<T> DerefMut for PinnedVec<T> {
    fn deref_mut(&mut self) -> &mut Vec<T> {
        &mut self.x
    }
}
// Thin newtype wrappers around the std slice iterators so PinnedVec can hand
// out its own iterator types.
pub struct PinnedIter<'a, T>(std::slice::Iter<'a, T>);
pub struct PinnedIterMut<'a, T>(std::slice::IterMut<'a, T>);
impl<'a, T> Iterator for PinnedIter<'a, T> {
    type Item = &'a T;
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}
impl<'a, T> Iterator for PinnedIterMut<'a, T> {
    type Item = &'a mut T;
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}
impl<'a, T> IntoIterator for &'a mut PinnedVec<T> {
type Item = &'a T;
type IntoIter = PinnedIter<'a, T>;
fn into_iter(self) -> Self::IntoIter {
PinnedIter(self.iter())
}
}
// NOTE(review): `self.iter()` here resolves through Deref to the slice's
// `iter` (the inherent `PinnedVec::iter` requires `T: Clone`, which this impl
// does not), so the wrapped iterator is `std::slice::Iter` as `PinnedIter`
// expects.
impl<'a, T> IntoIterator for &'a PinnedVec<T> {
    type Item = &'a T;
    type IntoIter = PinnedIter<'a, T>;
    fn into_iter(self) -> Self::IntoIter {
        PinnedIter(self.iter())
    }
}
impl<T: Clone> PinnedVec<T> {
    /// Ensure capacity for at least `size` elements and page-pin the buffer.
    ///
    /// If a reallocation is needed, the old buffer is unpinned first (when
    /// pinned), then the grown buffer is pinned. Also marks the vector
    /// pinnable as a side effect.
    pub fn reserve_and_pin(&mut self, size: usize) {
        if self.x.capacity() < size {
            if self.pinned {
                // Fix: unpin the heap buffer that `pin()` registered
                // (`self.x.as_mut_ptr()`). The previous `unpin(&mut self.x)`
                // coerced `&mut Vec<T>` to `*mut Vec<T>`, handing the address
                // of the Vec header — not the pinned buffer — to
                // cudaHostUnregister.
                unpin(self.x.as_mut_ptr());
                self.pinned = false;
            }
            self.x.reserve(size);
        }
        self.set_pinnable();
        if !self.pinned {
            pin(&mut self.x);
            self.pinned = true;
        }
    }

    /// Opt this vector into pinning (consulted by `check_ptr`).
    pub fn set_pinnable(&mut self) {
        self.pinnable = true;
    }

    /// Wrap an existing Vec; starts unpinned and non-pinnable.
    pub fn from_vec(source: Vec<T>) -> Self {
        Self {
            x: source,
            pinned: false,
            pinnable: false,
        }
    }

    /// Like `Vec::with_capacity`; starts unpinned and non-pinnable.
    pub fn with_capacity(capacity: usize) -> Self {
        let x = Vec::with_capacity(capacity);
        Self {
            x,
            pinned: false,
            pinnable: false,
        }
    }

    pub fn iter(&self) -> PinnedIter<T> {
        PinnedIter(self.x.iter())
    }

    pub fn iter_mut(&mut self) -> PinnedIterMut<T> {
        PinnedIterMut(self.x.iter_mut())
    }

    pub fn is_empty(&self) -> bool {
        self.x.is_empty()
    }

    pub fn len(&self) -> usize {
        self.x.len()
    }

    pub fn as_ptr(&self) -> *const T {
        self.x.as_ptr()
    }

    pub fn as_mut_ptr(&mut self) -> *mut T {
        self.x.as_mut_ptr()
    }

    /// Append an element, keeping the pinning bookkeeping consistent when
    /// the push forces a reallocation.
    pub fn push(&mut self, x: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin
        if self.pinned && self.x.capacity() == self.x.len() {
            unpin(old_ptr);
            self.pinned = false;
        }
        self.x.push(x);
        self.check_ptr(old_ptr, old_capacity, "push");
    }

    /// Resize, keeping the pinning bookkeeping consistent when growth forces
    /// a reallocation.
    pub fn resize(&mut self, size: usize, elem: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin.
        if self.pinned && self.x.capacity() < size {
            unpin(old_ptr);
            self.pinned = false;
        }
        self.x.resize(size, elem);
        self.check_ptr(old_ptr, old_capacity, "resize");
    }

    /// Re-pin after an operation that may have moved the buffer: when the
    /// perf libs are loaded, this vector is pinnable, and the buffer pointer
    /// or capacity changed, unpin the old buffer (if still marked pinned)
    /// and pin the new one.
    fn check_ptr(&mut self, _old_ptr: *mut T, _old_capacity: usize, _from: &'static str) {
        let api = perf_libs::api();
        if api.is_some()
            && self.pinnable
            && (self.x.as_ptr() != _old_ptr || self.x.capacity() != _old_capacity)
        {
            if self.pinned {
                unpin(_old_ptr);
            }
            trace!(
                "pinning from check_ptr old: {} size: {} from: {}",
                _old_capacity,
                self.x.capacity(),
                _from
            );
            pin(&mut self.x);
            self.pinned = true;
        }
    }
}
impl<T: Clone> Clone for PinnedVec<T> {
    /// Deep-copies the contents; if the source buffer was pinned, the fresh
    /// buffer is pinned as well. Pinnability carries over unchanged.
    fn clone(&self) -> Self {
        let mut copy = self.x.clone();
        if self.pinned {
            pin(&mut copy);
        }
        debug!(
            "clone PinnedVec: size: {} pinned?: {} pinnable?: {}",
            self.x.capacity(),
            self.pinned,
            self.pinnable
        );
        Self {
            x: copy,
            pinned: self.pinned,
            pinnable: self.pinnable,
        }
    }
}
impl<T> Drop for PinnedVec<T> {
    /// Unregister the buffer before the Vec frees it, so we never leave a
    /// dangling cudaHostRegister entry behind.
    fn drop(&mut self) {
        if self.pinned {
            unpin(self.x.as_mut_ptr());
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises push/resize/indexing/iteration on an unpinned PinnedVec
    // (no CUDA libs are loaded in unit tests, so pinning is a no-op).
    #[test]
    fn test_pinned_vec() {
        let mut mem = PinnedVec::with_capacity(10);
        mem.set_pinnable();
        mem.push(50);
        mem.resize(2, 10);
        assert_eq!(mem.len(), 2);
        assert!(!mem.is_empty());
        assert_eq!(mem[0], 50);
        assert_eq!(mem[1], 10);
        let collected: Vec<_> = mem.iter().cloned().collect();
        assert_eq!(collected, vec![50, 10]);
    }
}

6
perf/src/lib.rs Normal file
View File

@ -0,0 +1,6 @@
//! `solana-perf`: performance-sensitive building blocks split out of the main
//! crate — CUDA page-pinning helpers, dynamically loaded perf libraries, and
//! an object recycler.
pub mod cuda_runtime;
pub mod perf_libs;
pub mod recycler;
// Makes log's macros (info!/warn!/trace!/error!/debug!) visible to all
// submodules without per-module `use` statements.
#[macro_use]
extern crate log;

190
perf/src/perf_libs.rs Normal file
View File

@ -0,0 +1,190 @@
use core::ffi::c_void;
use dlopen::symbor::{Container, SymBorApi, Symbol};
use dlopen_derive::SymBorApi;
use log::*;
use solana_sdk::packet::Packet;
use std::env;
use std::ffi::OsStr;
use std::fs;
use std::os::raw::{c_int, c_uint};
use std::path::{Path, PathBuf};
use std::sync::Once;
/// C-layout descriptor of a packet batch handed across the FFI boundary:
/// a raw pointer to `num` contiguous `Packet`s.
#[repr(C)]
pub struct Elems {
    pub elems: *const Packet,
    pub num: u32,
}
/// Function table resolved from the dynamically loaded perf library
/// (`libcuda-crypt.so`). Each field borrows a symbol from the `Container`
/// that loaded the shared object (see `init`); signatures must match the
/// C exports exactly.
#[derive(SymBorApi)]
pub struct Api<'a> {
    pub ed25519_init: Symbol<'a, unsafe extern "C" fn() -> bool>,
    pub ed25519_set_verbose: Symbol<'a, unsafe extern "C" fn(val: bool)>,
    /// Batch signature verification over packet vecs; writes the per-signature
    /// results into `out`.
    #[allow(clippy::type_complexity)]
    pub ed25519_verify_many: Symbol<
        'a,
        unsafe extern "C" fn(
            vecs: *const Elems,
            num: u32,          //number of vecs
            message_size: u32, //size of each element inside the elems field of the vec
            total_packets: u32,
            total_signatures: u32,
            message_lens: *const u32,
            pubkey_offsets: *const u32,
            signature_offsets: *const u32,
            signed_message_offsets: *const u32,
            out: *mut u8, //combined length of all the items in vecs
            use_non_default_stream: u8,
        ) -> u32,
    >,
    /// Batch signing; writes signatures into `signatures_out`.
    /// (Parameter name typo `sgnatures_out` fixed — names in fn-pointer
    /// types are documentation only, so the ABI is unchanged.)
    #[allow(clippy::type_complexity)]
    pub ed25519_sign_many: Symbol<
        'a,
        unsafe extern "C" fn(
            vecs: *mut Elems,
            num: u32,          //number of vecs
            message_size: u32, //size of each element inside the elems field of the vec
            total_packets: u32,
            total_signatures: u32,
            message_lens: *const u32,
            pubkey_offsets: *const u32,
            privkey_offsets: *const u32,
            signed_message_offsets: *const u32,
            signatures_out: *mut u8, //combined length of all the items in vecs
            use_non_default_stream: u8,
        ) -> u32,
    >,
    pub chacha_cbc_encrypt_many_sample: Symbol<
        'a,
        unsafe extern "C" fn(
            input: *const u8,
            sha_state: *mut u8,
            in_len: usize,
            keys: *const u8,
            ivec: *mut u8,
            num_keys: u32,
            samples: *const u64,
            num_samples: u32,
            starting_block: u64,
            time_us: *mut f32,
        ),
    >,
    pub chacha_init_sha_state: Symbol<'a, unsafe extern "C" fn(sha_state: *mut u8, num_keys: u32)>,
    pub chacha_end_sha_state:
        Symbol<'a, unsafe extern "C" fn(sha_state_in: *const u8, out: *mut u8, num_keys: u32)>,
    pub poh_verify_many: Symbol<
        'a,
        unsafe extern "C" fn(
            hashes: *mut u8,
            num_hashes_arr: *const u64,
            num_elems: usize,
            use_non_default_stream: u8,
        ) -> c_int,
    >,
    /// cudaHostRegister / cudaHostUnregister wrappers used by
    /// `cuda_runtime::pin` / `unpin` for page-pinning host memory.
    pub cuda_host_register:
        Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void, size: usize, flags: c_uint) -> c_int>,
    pub cuda_host_unregister: Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void) -> c_int>,
}
// Process-wide handle to the loaded perf library. Written at most once by
// `init` (guarded by its `Once`) and read through `api()`.
static mut API: Option<Container<Api>> = None;
// Loads the shared object `name` into `API`, exiting the process on failure.
// NOTE(review): this logs on every call, even after the library was already
// loaded, and a second call with a *different* name is silently ignored by
// `call_once` — confirm callers only ever pass one library per process.
fn init(name: &OsStr) {
    static INIT_HOOK: Once = Once::new();
    info!("Loading {:?}", name);
    unsafe {
        INIT_HOOK.call_once(|| {
            API = Some(Container::load(name).unwrap_or_else(|err| {
                error!("Unable to load {:?}: {}", name, err);
                std::process::exit(1);
            }));
        })
    }
}
/// Looks for a `perf-libs` directory next to the running executable and
/// returns its path, logging and returning `None` when it is absent.
fn locate_perf_libs() -> Option<PathBuf> {
    let exe = env::current_exe().expect("Unable to get executable path");
    let candidate = exe.parent().unwrap().join("perf-libs");
    if !candidate.is_dir() {
        warn!("{:?} does not exist", candidate);
        return None;
    }
    info!("perf-libs found at {:?}", candidate);
    Some(candidate)
}
/// Matches the `cuda-<version>` subdirectories of `perf_libs_path` against
/// installations under `/usr/local`, returning the first install found.
fn find_cuda_home(perf_libs_path: &Path) -> Option<PathBuf> {
    // Search /usr/local for a `cuda-` directory that matches a perf-libs subdirectory
    for entry in fs::read_dir(&perf_libs_path).unwrap().flatten() {
        let path = entry.path();
        if !path.is_dir() {
            continue;
        }
        let dir_name = path.file_name().unwrap().to_str().unwrap_or("");
        if !dir_name.starts_with("cuda-") {
            continue;
        }
        let cuda_home: PathBuf = ["/", "usr", "local", dir_name].iter().collect();
        if cuda_home.is_dir() {
            return Some(cuda_home);
        }
    }
    None
}
/// Locates the perf-libs directory and a matching CUDA installation,
/// prepends `$CUDA_HOME/lib64` to `LD_LIBRARY_PATH`, and loads the matching
/// `libcuda-crypt.so`. Falls back to loading `libcuda-crypt.so` from the
/// default search path when nothing is found.
pub fn init_cuda() {
    if let Some(perf_libs_path) = locate_perf_libs() {
        if let Some(cuda_home) = find_cuda_home(&perf_libs_path) {
            info!("CUDA installation found at {:?}", cuda_home);
            let cuda_lib64_dir = cuda_home.join("lib64");
            if cuda_lib64_dir.is_dir() {
                let ld_library_path = cuda_lib64_dir.to_str().unwrap_or("").to_string()
                    + ":"
                    + &env::var("LD_LIBRARY_PATH").unwrap_or_else(|_| "".to_string());
                info!("LD_LIBRARY_PATH set to {:?}", ld_library_path);
                // Prefix LD_LIBRARY_PATH with $CUDA_HOME/lib64 directory
                // to ensure the correct CUDA version is used
                // NOTE(review): the dynamic loader typically snapshots
                // LD_LIBRARY_PATH at process start, so setting it here may
                // only affect child processes / dependent-library lookup —
                // confirm this has the intended effect.
                env::set_var("LD_LIBRARY_PATH", ld_library_path)
            } else {
                warn!("{:?} does not exist", cuda_lib64_dir);
            }
            // Load the libcuda-crypt.so build that matches the cuda-<ver>
            // subdirectory we just matched.
            let libcuda_crypt = perf_libs_path
                .join(cuda_home.file_name().unwrap())
                .join("libcuda-crypt.so");
            return init(libcuda_crypt.as_os_str());
        } else {
            warn!("CUDA installation not found");
        }
    }
    // Last resort! Blindly load the shared object and hope it all works out
    init(OsStr::new("libcuda-crypt.so"))
}
/// Returns the loaded perf-libs function table, or `None` if no library has
/// been loaded.
///
/// On the first call this triggers `init_cuda()` when the
/// `TEST_PERF_LIBS_CUDA` env var is set (a test hook); otherwise the library
/// is only present if `init_cuda` was called earlier.
pub fn api() -> Option<&'static Container<Api<'static>>> {
    {
        static INIT_HOOK: Once = Once::new();
        INIT_HOOK.call_once(|| {
            if std::env::var("TEST_PERF_LIBS_CUDA").is_ok() {
                init_cuda();
            }
        })
    }
    // NOTE(review): unsynchronized read of a `static mut`; a concurrent
    // first call to `init` on another thread could race this read — confirm
    // initialization always happens-before use.
    unsafe { API.as_ref() }
}

111
perf/src/recycler.rs Normal file
View File

@ -0,0 +1,111 @@
use rand::{thread_rng, Rng};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
// Counters shared by all clones of a Recycler: objects created, objects
// reused from the free list, and the free list's high-water mark.
#[derive(Debug, Default)]
struct RecyclerStats {
    total: AtomicUsize,
    reuse: AtomicUsize,
    max_gc: AtomicUsize,
}
/// A simple free-list allocator: objects returned via `recycle` are `Reset`
/// and handed back out by `allocate` instead of constructing fresh ones.
/// Clones share the same free list and stats.
#[derive(Debug)]
pub struct Recycler<T> {
    // The free list of recycled objects ("gc"-ed, awaiting reuse).
    gc: Arc<Mutex<Vec<T>>>,
    stats: Arc<RecyclerStats>,
    // Random id used only to tell instances apart in trace logs.
    id: usize,
}
impl<T: Default> Default for Recycler<T> {
fn default() -> Recycler<T> {
let id = thread_rng().gen_range(0, 1000);
trace!("new recycler..{}", id);
Recycler {
gc: Arc::new(Mutex::new(vec![])),
stats: Arc::new(RecyclerStats::default()),
id,
}
}
}
impl<T: Default> Clone for Recycler<T> {
fn clone(&self) -> Recycler<T> {
Recycler {
gc: self.gc.clone(),
stats: self.stats.clone(),
id: self.id,
}
}
}
/// Implemented by types that can be cheaply restored to a clean state before
/// being handed back out by `Recycler::allocate`.
pub trait Reset {
    fn reset(&mut self);
}
impl<T: Default + Reset> Recycler<T> {
    /// Pops a recycled object off the free list (resetting it) or constructs
    /// a fresh `T::default()`, updating the shared stats either way. `name`
    /// is only used for trace logging.
    pub fn allocate(&self, name: &'static str) -> T {
        let recycled = self
            .gc
            .lock()
            .expect("recycler lock in pb fn allocate")
            .pop();
        match recycled {
            Some(mut object) => {
                self.stats.reuse.fetch_add(1, Ordering::Relaxed);
                object.reset();
                object
            }
            None => {
                trace!(
                    "allocating new: total {} {:?} id: {} reuse: {} max_gc: {}",
                    self.stats.total.fetch_add(1, Ordering::Relaxed),
                    name,
                    self.id,
                    self.stats.reuse.load(Ordering::Relaxed),
                    self.stats.max_gc.load(Ordering::Relaxed),
                );
                T::default()
            }
        }
    }

    /// Returns an object to the free list and updates the high-water mark.
    pub fn recycle(&self, x: T) {
        let mut gc = self.gc.lock().expect("recycler lock in pub fn recycle");
        gc.push(x);
        let len = gc.len();
        drop(gc);
        let max_gc = self.stats.max_gc.load(Ordering::Relaxed);
        if len > max_gc {
            // this is not completely accurate, but for most cases should be fine.
            self.stats
                .max_gc
                .compare_and_swap(max_gc, len, Ordering::Relaxed);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Recycled u64s come back as 10 so a reused allocation is observable.
    impl Reset for u64 {
        fn reset(&mut self) {
            *self = 10;
        }
    }

    #[test]
    fn test_recycler() {
        let recycler = Recycler::default();
        // First allocation: free list empty, so we get T::default().
        let mut value: u64 = recycler.allocate("test_recycler1");
        assert_eq!(value, 0);
        value = 20;
        // Clones share the free list: recycling through a clone is visible
        // from the original handle.
        let shared = recycler.clone();
        shared.recycle(value);
        assert_eq!(recycler.gc.lock().unwrap().len(), 1);
        // Second allocation reuses the recycled object after reset().
        let reused = recycler.allocate("test_recycler2");
        assert_eq!(reused, 10);
        assert_eq!(recycler.gc.lock().unwrap().len(), 0);
    }
}