committed by
Grimes
parent
3133ee2401
commit
b825d04597
20
perf/Cargo.toml
Normal file
20
perf/Cargo.toml
Normal file
@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "solana-perf"
|
||||
version = "0.21.0"
|
||||
description = "Solana Performance APIs"
|
||||
authors = ["Solana Maintainers <maintainers@solana.com>"]
|
||||
repository = "https://github.com/solana-labs/solana"
|
||||
license = "Apache-2.0"
|
||||
homepage = "https://solana.com/"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
rand = "0.6.5"
|
||||
dlopen = "0.1.8"
|
||||
dlopen_derive = "0.1.4"
|
||||
log = "0.4.8"
|
||||
solana-sdk = { path = "../sdk", version = "0.21.0" }
|
||||
|
||||
[lib]
|
||||
name = "solana_perf"
|
||||
|
295
perf/src/cuda_runtime.rs
Normal file
295
perf/src/cuda_runtime.rs
Normal file
@ -0,0 +1,295 @@
|
||||
// Module for cuda-related helper functions and wrappers.
|
||||
//
|
||||
// cudaHostRegister/cudaHostUnregister -
|
||||
// apis for page-pinning memory. Cuda driver/hardware cannot overlap
|
||||
// copies from host memory to GPU memory unless the memory is page-pinned and
|
||||
// cannot be paged to disk. The cuda driver provides these interfaces to pin and unpin memory.
|
||||
|
||||
use crate::perf_libs;
|
||||
use crate::recycler::Reset;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
#[cfg(feature = "pin_gpu_memory")]
|
||||
use std::os::raw::c_int;
|
||||
|
||||
#[cfg(feature = "pin_gpu_memory")]
|
||||
const CUDA_SUCCESS: c_int = 0;
|
||||
|
||||
/// Registers the heap buffer behind `_mem` as page-pinned memory through the
/// `cuda_host_register` entry point of the loaded perf library.
///
/// The body only exists when the crate is built with the `pin_gpu_memory`
/// feature; without it, or when `perf_libs::api()` yields `None`, this is a
/// no-op. The registered region spans the vector's whole *capacity*, not just
/// its length. A non-success status is logged and otherwise ignored.
pub fn pin<T>(_mem: &mut Vec<T>) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        use core::ffi::c_void;
        use std::mem::size_of;

        let api = match perf_libs::api() {
            Some(api) => api,
            None => return,
        };

        let num_bytes = _mem.capacity() * size_of::<T>();
        // SAFETY: the pointer and byte count describe the vector's own
        // allocation. Soundness of the foreign call itself depends on the
        // loaded library and cannot be checked from here.
        let status =
            unsafe { (api.cuda_host_register)(_mem.as_mut_ptr() as *mut c_void, num_bytes, 0) };

        if status != CUDA_SUCCESS {
            error!(
                "cudaHostRegister error: {} ptr: {:?} bytes: {}",
                status,
                _mem.as_ptr(),
                num_bytes
            );
        }
    }
}
|
||||
|
||||
/// Unregisters a previously pinned buffer through the `cuda_host_unregister`
/// entry point of the loaded perf library.
///
/// `_mem` is passed to the library unchanged (cast to `*mut c_void`). The body
/// only exists when the crate is built with the `pin_gpu_memory` feature;
/// without it, or when `perf_libs::api()` yields `None`, this is a no-op. A
/// non-success status is logged and otherwise ignored.
pub fn unpin<T>(_mem: *mut T) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        use core::ffi::c_void;

        let api = match perf_libs::api() {
            Some(api) => api,
            None => return,
        };

        // SAFETY: the pointer is only forwarded, never dereferenced here.
        // Soundness of the foreign call depends on the loaded library.
        let status = unsafe { (api.cuda_host_unregister)(_mem as *mut c_void) };

        if status != CUDA_SUCCESS {
            error!("cudaHostUnregister returned: {} ptr: {:?}", status, _mem);
        }
    }
}
|
||||
|
||||
/// A vector wrapper whose underlying memory can be page-pinned.
///
/// Pinning is controlled by flags so that a user can choose to pin only in
/// certain circumstances.
#[derive(Debug)]
pub struct PinnedVec<T> {
    /// The wrapped vector holding the elements.
    x: Vec<T>,
    /// Set after `pin` has been called on the current buffer and cleared when
    /// the buffer is unpinned.
    pinned: bool,
    /// When set, the buffer is re-pinned after an operation that moved or
    /// resized the allocation.
    pinnable: bool,
}
|
||||
|
||||
/// Recycling support: resetting a `PinnedVec` truncates it to zero length.
impl<T: Default + Clone> Reset for PinnedVec<T> {
    fn reset(&mut self) {
        // `resize` always wants a fill value, even though none is written when
        // shrinking to zero.
        let filler = T::default();
        self.resize(0, filler);
    }
}
|
||||
|
||||
impl<T: Clone> Default for PinnedVec<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
x: Vec::new(),
|
||||
pinned: false,
|
||||
pinnable: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for PinnedVec<T> {
|
||||
type Target = Vec<T>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.x
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for PinnedVec<T> {
|
||||
fn deref_mut(&mut self) -> &mut Vec<T> {
|
||||
&mut self.x
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over shared references, wrapping a slice iterator.
pub struct PinnedIter<'a, T>(std::slice::Iter<'a, T>);

/// Iterator over mutable references, wrapping a mutable slice iterator.
pub struct PinnedIterMut<'a, T>(std::slice::IterMut<'a, T>);

impl<'a, T> Iterator for PinnedIter<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    // Forward the exact bounds known by the slice iterator; the trait's
    // default is `(0, None)`, which prevents consumers such as `collect()`
    // from reserving the right amount up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl<'a, T> Iterator for PinnedIterMut<'a, T> {
    type Item = &'a mut T;

    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    // See `PinnedIter::size_hint`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
|
||||
|
||||
impl<'a, T> IntoIterator for &'a mut PinnedVec<T> {
|
||||
type Item = &'a T;
|
||||
type IntoIter = PinnedIter<'a, T>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
PinnedIter(self.iter())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> IntoIterator for &'a PinnedVec<T> {
|
||||
type Item = &'a T;
|
||||
type IntoIter = PinnedIter<'a, T>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
PinnedIter(self.iter())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Clone> PinnedVec<T> {
|
||||
pub fn reserve_and_pin(&mut self, size: usize) {
|
||||
if self.x.capacity() < size {
|
||||
if self.pinned {
|
||||
unpin(&mut self.x);
|
||||
self.pinned = false;
|
||||
}
|
||||
self.x.reserve(size);
|
||||
}
|
||||
self.set_pinnable();
|
||||
if !self.pinned {
|
||||
pin(&mut self.x);
|
||||
self.pinned = true;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_pinnable(&mut self) {
|
||||
self.pinnable = true;
|
||||
}
|
||||
|
||||
pub fn from_vec(source: Vec<T>) -> Self {
|
||||
Self {
|
||||
x: source,
|
||||
pinned: false,
|
||||
pinnable: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
let x = Vec::with_capacity(capacity);
|
||||
Self {
|
||||
x,
|
||||
pinned: false,
|
||||
pinnable: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> PinnedIter<T> {
|
||||
PinnedIter(self.x.iter())
|
||||
}
|
||||
|
||||
pub fn iter_mut(&mut self) -> PinnedIterMut<T> {
|
||||
PinnedIterMut(self.x.iter_mut())
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.x.is_empty()
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.x.len()
|
||||
}
|
||||
|
||||
pub fn as_ptr(&self) -> *const T {
|
||||
self.x.as_ptr()
|
||||
}
|
||||
|
||||
pub fn as_mut_ptr(&mut self) -> *mut T {
|
||||
self.x.as_mut_ptr()
|
||||
}
|
||||
|
||||
pub fn push(&mut self, x: T) {
|
||||
let old_ptr = self.x.as_mut_ptr();
|
||||
let old_capacity = self.x.capacity();
|
||||
// Predict realloc and unpin
|
||||
if self.pinned && self.x.capacity() == self.x.len() {
|
||||
unpin(old_ptr);
|
||||
self.pinned = false;
|
||||
}
|
||||
self.x.push(x);
|
||||
self.check_ptr(old_ptr, old_capacity, "push");
|
||||
}
|
||||
|
||||
pub fn resize(&mut self, size: usize, elem: T) {
|
||||
let old_ptr = self.x.as_mut_ptr();
|
||||
let old_capacity = self.x.capacity();
|
||||
// Predict realloc and unpin.
|
||||
if self.pinned && self.x.capacity() < size {
|
||||
unpin(old_ptr);
|
||||
self.pinned = false;
|
||||
}
|
||||
self.x.resize(size, elem);
|
||||
self.check_ptr(old_ptr, old_capacity, "resize");
|
||||
}
|
||||
|
||||
fn check_ptr(&mut self, _old_ptr: *mut T, _old_capacity: usize, _from: &'static str) {
|
||||
let api = perf_libs::api();
|
||||
if api.is_some()
|
||||
&& self.pinnable
|
||||
&& (self.x.as_ptr() != _old_ptr || self.x.capacity() != _old_capacity)
|
||||
{
|
||||
if self.pinned {
|
||||
unpin(_old_ptr);
|
||||
}
|
||||
|
||||
trace!(
|
||||
"pinning from check_ptr old: {} size: {} from: {}",
|
||||
_old_capacity,
|
||||
self.x.capacity(),
|
||||
_from
|
||||
);
|
||||
pin(&mut self.x);
|
||||
self.pinned = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Clone> Clone for PinnedVec<T> {
|
||||
fn clone(&self) -> Self {
|
||||
let mut x = self.x.clone();
|
||||
let pinned = if self.pinned {
|
||||
pin(&mut x);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
debug!(
|
||||
"clone PinnedVec: size: {} pinned?: {} pinnable?: {}",
|
||||
self.x.capacity(),
|
||||
self.pinned,
|
||||
self.pinnable
|
||||
);
|
||||
Self {
|
||||
x,
|
||||
pinned,
|
||||
pinnable: self.pinnable,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for PinnedVec<T> {
|
||||
fn drop(&mut self) {
|
||||
if self.pinned {
|
||||
unpin(self.x.as_mut_ptr());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises push/resize, indexing through `Deref`, and iteration.
    #[test]
    fn test_pinned_vec() {
        let mut mem = PinnedVec::with_capacity(10);
        mem.set_pinnable();
        mem.push(50);
        mem.resize(2, 10);

        assert_eq!((mem[0], mem[1]), (50, 10));
        assert_eq!(mem.len(), 2);
        assert!(!mem.is_empty());

        let mut iter = mem.iter();
        assert_eq!(iter.next(), Some(&50));
        assert_eq!(iter.next(), Some(&10));
        assert_eq!(iter.next(), None);
    }
}
|
6
perf/src/lib.rs
Normal file
6
perf/src/lib.rs
Normal file
@ -0,0 +1,6 @@
|
||||
pub mod cuda_runtime;
|
||||
pub mod perf_libs;
|
||||
pub mod recycler;
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
190
perf/src/perf_libs.rs
Normal file
190
perf/src/perf_libs.rs
Normal file
@ -0,0 +1,190 @@
|
||||
use core::ffi::c_void;
|
||||
use dlopen::symbor::{Container, SymBorApi, Symbol};
|
||||
use dlopen_derive::SymBorApi;
|
||||
use log::*;
|
||||
use solana_sdk::packet::Packet;
|
||||
use std::env;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::os::raw::{c_int, c_uint};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Once;
|
||||
|
||||
#[repr(C)]
|
||||
pub struct Elems {
|
||||
pub elems: *const Packet,
|
||||
pub num: u32,
|
||||
}
|
||||
|
||||
#[derive(SymBorApi)]
|
||||
pub struct Api<'a> {
|
||||
pub ed25519_init: Symbol<'a, unsafe extern "C" fn() -> bool>,
|
||||
pub ed25519_set_verbose: Symbol<'a, unsafe extern "C" fn(val: bool)>,
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub ed25519_verify_many: Symbol<
|
||||
'a,
|
||||
unsafe extern "C" fn(
|
||||
vecs: *const Elems,
|
||||
num: u32, //number of vecs
|
||||
message_size: u32, //size of each element inside the elems field of the vec
|
||||
total_packets: u32,
|
||||
total_signatures: u32,
|
||||
message_lens: *const u32,
|
||||
pubkey_offsets: *const u32,
|
||||
signature_offsets: *const u32,
|
||||
signed_message_offsets: *const u32,
|
||||
out: *mut u8, //combined length of all the items in vecs
|
||||
use_non_default_stream: u8,
|
||||
) -> u32,
|
||||
>,
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub ed25519_sign_many: Symbol<
|
||||
'a,
|
||||
unsafe extern "C" fn(
|
||||
vecs: *mut Elems,
|
||||
num: u32, //number of vecs
|
||||
message_size: u32, //size of each element inside the elems field of the vec
|
||||
total_packets: u32,
|
||||
total_signatures: u32,
|
||||
message_lens: *const u32,
|
||||
pubkey_offsets: *const u32,
|
||||
privkey_offsets: *const u32,
|
||||
signed_message_offsets: *const u32,
|
||||
sgnatures_out: *mut u8, //combined length of all the items in vecs
|
||||
use_non_default_stream: u8,
|
||||
) -> u32,
|
||||
>,
|
||||
|
||||
pub chacha_cbc_encrypt_many_sample: Symbol<
|
||||
'a,
|
||||
unsafe extern "C" fn(
|
||||
input: *const u8,
|
||||
sha_state: *mut u8,
|
||||
in_len: usize,
|
||||
keys: *const u8,
|
||||
ivec: *mut u8,
|
||||
num_keys: u32,
|
||||
samples: *const u64,
|
||||
num_samples: u32,
|
||||
starting_block: u64,
|
||||
time_us: *mut f32,
|
||||
),
|
||||
>,
|
||||
|
||||
pub chacha_init_sha_state: Symbol<'a, unsafe extern "C" fn(sha_state: *mut u8, num_keys: u32)>,
|
||||
pub chacha_end_sha_state:
|
||||
Symbol<'a, unsafe extern "C" fn(sha_state_in: *const u8, out: *mut u8, num_keys: u32)>,
|
||||
|
||||
pub poh_verify_many: Symbol<
|
||||
'a,
|
||||
unsafe extern "C" fn(
|
||||
hashes: *mut u8,
|
||||
num_hashes_arr: *const u64,
|
||||
num_elems: usize,
|
||||
use_non_default_stream: u8,
|
||||
) -> c_int,
|
||||
>,
|
||||
|
||||
pub cuda_host_register:
|
||||
Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void, size: usize, flags: c_uint) -> c_int>,
|
||||
|
||||
pub cuda_host_unregister: Symbol<'a, unsafe extern "C" fn(ptr: *mut c_void) -> c_int>,
|
||||
}
|
||||
|
||||
static mut API: Option<Container<Api>> = None;
|
||||
|
||||
fn init(name: &OsStr) {
|
||||
static INIT_HOOK: Once = Once::new();
|
||||
|
||||
info!("Loading {:?}", name);
|
||||
unsafe {
|
||||
INIT_HOOK.call_once(|| {
|
||||
API = Some(Container::load(name).unwrap_or_else(|err| {
|
||||
error!("Unable to load {:?}: {}", name, err);
|
||||
std::process::exit(1);
|
||||
}));
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn locate_perf_libs() -> Option<PathBuf> {
|
||||
let exe = env::current_exe().expect("Unable to get executable path");
|
||||
let perf_libs = exe.parent().unwrap().join("perf-libs");
|
||||
if perf_libs.is_dir() {
|
||||
info!("perf-libs found at {:?}", perf_libs);
|
||||
return Some(perf_libs);
|
||||
}
|
||||
warn!("{:?} does not exist", perf_libs);
|
||||
None
|
||||
}
|
||||
|
||||
/// Finds a CUDA installation under `/usr/local` that matches one of the
/// `cuda-*` subdirectories of `perf_libs_path`.
///
/// Each subdirectory of `perf_libs_path` whose name starts with `cuda-` is
/// mapped to `/usr/local/<name>`; the first such path that is an existing
/// directory is returned. Returns `None` when nothing matches, and also when
/// `perf_libs_path` cannot be read (missing, not a directory, or permission
/// denied), so callers can fall back instead of panicking.
fn find_cuda_home(perf_libs_path: &Path) -> Option<PathBuf> {
    fs::read_dir(perf_libs_path)
        .ok()?
        // Entries that fail to read are skipped rather than aborting the scan.
        .filter_map(Result::ok)
        .map(|entry| entry.path())
        .filter(|path| path.is_dir())
        .filter_map(|path| {
            // Names that are not valid UTF-8 cannot match the `cuda-` prefix.
            let dir_name = path.file_name()?.to_str()?;
            if dir_name.starts_with("cuda-") {
                Some(Path::new("/usr/local").join(dir_name))
            } else {
                None
            }
        })
        .find(|cuda_home| cuda_home.is_dir())
}
|
||||
|
||||
pub fn init_cuda() {
|
||||
if let Some(perf_libs_path) = locate_perf_libs() {
|
||||
if let Some(cuda_home) = find_cuda_home(&perf_libs_path) {
|
||||
info!("CUDA installation found at {:?}", cuda_home);
|
||||
|
||||
let cuda_lib64_dir = cuda_home.join("lib64");
|
||||
if cuda_lib64_dir.is_dir() {
|
||||
let ld_library_path = cuda_lib64_dir.to_str().unwrap_or("").to_string()
|
||||
+ ":"
|
||||
+ &env::var("LD_LIBRARY_PATH").unwrap_or_else(|_| "".to_string());
|
||||
info!("LD_LIBRARY_PATH set to {:?}", ld_library_path);
|
||||
|
||||
// Prefix LD_LIBRARY_PATH with $CUDA_HOME/lib64 directory
|
||||
// to ensure the correct CUDA version is used
|
||||
env::set_var("LD_LIBRARY_PATH", ld_library_path)
|
||||
} else {
|
||||
warn!("{:?} does not exist", cuda_lib64_dir);
|
||||
}
|
||||
|
||||
let libcuda_crypt = perf_libs_path
|
||||
.join(cuda_home.file_name().unwrap())
|
||||
.join("libcuda-crypt.so");
|
||||
return init(libcuda_crypt.as_os_str());
|
||||
} else {
|
||||
warn!("CUDA installation not found");
|
||||
}
|
||||
}
|
||||
|
||||
// Last resort! Blindly load the shared object and hope it all works out
|
||||
init(OsStr::new("libcuda-crypt.so"))
|
||||
}
|
||||
|
||||
pub fn api() -> Option<&'static Container<Api<'static>>> {
|
||||
{
|
||||
static INIT_HOOK: Once = Once::new();
|
||||
INIT_HOOK.call_once(|| {
|
||||
if std::env::var("TEST_PERF_LIBS_CUDA").is_ok() {
|
||||
init_cuda();
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
unsafe { API.as_ref() }
|
||||
}
|
111
perf/src/recycler.rs
Normal file
111
perf/src/recycler.rs
Normal file
@ -0,0 +1,111 @@
|
||||
use rand::{thread_rng, Rng};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Counters describing how a recycler pool is being used; all start at zero.
#[derive(Debug, Default)]
struct RecyclerStats {
    /// Bumped by `allocate` on the path that constructs a fresh object.
    total: AtomicUsize,
    /// Bumped by `allocate` each time a pooled object is handed out again.
    reuse: AtomicUsize,
    /// Largest pool length recorded by `recycle` (best effort).
    max_gc: AtomicUsize,
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Recycler<T> {
|
||||
gc: Arc<Mutex<Vec<T>>>,
|
||||
stats: Arc<RecyclerStats>,
|
||||
id: usize,
|
||||
}
|
||||
|
||||
impl<T: Default> Default for Recycler<T> {
|
||||
fn default() -> Recycler<T> {
|
||||
let id = thread_rng().gen_range(0, 1000);
|
||||
trace!("new recycler..{}", id);
|
||||
Recycler {
|
||||
gc: Arc::new(Mutex::new(vec![])),
|
||||
stats: Arc::new(RecyclerStats::default()),
|
||||
id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Default> Clone for Recycler<T> {
|
||||
fn clone(&self) -> Recycler<T> {
|
||||
Recycler {
|
||||
gc: self.gc.clone(),
|
||||
stats: self.stats.clone(),
|
||||
id: self.id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Implemented by types that can be handed out again by a `Recycler`.
pub trait Reset {
    /// Called by `Recycler::allocate` on a pooled object just before it is
    /// returned to the caller.
    fn reset(&mut self);
}
|
||||
|
||||
impl<T: Default + Reset> Recycler<T> {
|
||||
pub fn allocate(&self, name: &'static str) -> T {
|
||||
let new = self
|
||||
.gc
|
||||
.lock()
|
||||
.expect("recycler lock in pb fn allocate")
|
||||
.pop();
|
||||
|
||||
if let Some(mut x) = new {
|
||||
self.stats.reuse.fetch_add(1, Ordering::Relaxed);
|
||||
x.reset();
|
||||
return x;
|
||||
}
|
||||
|
||||
trace!(
|
||||
"allocating new: total {} {:?} id: {} reuse: {} max_gc: {}",
|
||||
self.stats.total.fetch_add(1, Ordering::Relaxed),
|
||||
name,
|
||||
self.id,
|
||||
self.stats.reuse.load(Ordering::Relaxed),
|
||||
self.stats.max_gc.load(Ordering::Relaxed),
|
||||
);
|
||||
|
||||
T::default()
|
||||
}
|
||||
|
||||
pub fn recycle(&self, x: T) {
|
||||
let len = {
|
||||
let mut gc = self.gc.lock().expect("recycler lock in pub fn recycle");
|
||||
gc.push(x);
|
||||
gc.len()
|
||||
};
|
||||
|
||||
let max_gc = self.stats.max_gc.load(Ordering::Relaxed);
|
||||
if len > max_gc {
|
||||
// this is not completely accurate, but for most cases should be fine.
|
||||
self.stats
|
||||
.max_gc
|
||||
.compare_and_swap(max_gc, len, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Resetting sets a recognizable value so reuse can be told apart from a
    // fresh `u64::default()`.
    impl Reset for u64 {
        fn reset(&mut self) {
            *self = 10;
        }
    }

    /// A fresh allocation is the default; a recycled one comes back reset, and
    /// clones share the same pool.
    #[test]
    fn test_recycler() {
        let recycler = Recycler::default();
        let fresh: u64 = recycler.allocate("test_recycler1");
        assert_eq!(fresh, 0);

        let recycler2 = recycler.clone();
        recycler2.recycle(20);
        assert_eq!(recycler.gc.lock().unwrap().len(), 1);

        let reused = recycler.allocate("test_recycler2");
        assert_eq!(reused, 10);
        assert_eq!(recycler.gc.lock().unwrap().len(), 0);
    }
}
|
Reference in New Issue
Block a user