Page-pin packet memory for cuda (#4250)
* Page-pin packet memory for cuda
  Bring back recyclers and pin offset buffers
* Add packet recycler to streamer
* Add set_pinnable to sigverify vecs to pin them
* Add packets reset test
* Add test for recycler and reduce the gc lock critical section
* Add comments/tests to cuda_runtime
* Add recycler to recv_blobs path.
* Add trace/names for debug and PacketsRecycler to bench-streamer
* Predict realloc and unpin beforehand.
* Add helper to reserve and pin
* Cap buffered packets length
* Call cuda wrapper functions
This commit is contained in:
@@ -4,7 +4,9 @@
|
||||
//! offloaded to the GPU.
|
||||
//!
|
||||
|
||||
use crate::cuda_runtime::PinnedVec;
|
||||
use crate::packet::{Packet, Packets};
|
||||
use crate::recycler::Recycler;
|
||||
use crate::result::Result;
|
||||
use bincode::serialized_size;
|
||||
use rayon::ThreadPool;
|
||||
@@ -18,7 +20,10 @@ use solana_sdk::transaction::Transaction;
|
||||
use std::mem::size_of;
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
use std::os::raw::c_int;
|
||||
use std::os::raw::{c_int, c_uint};
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
use core::ffi::c_void;
|
||||
|
||||
pub const NUM_THREADS: u32 = 10;
|
||||
use std::cell::RefCell;
|
||||
@@ -28,7 +33,9 @@ thread_local!(static PAR_THREAD_POOL: RefCell<ThreadPool> = RefCell::new(rayon::
|
||||
.build()
|
||||
.unwrap()));
|
||||
|
||||
type TxOffsets = (Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>, Vec<Vec<u32>>);
|
||||
pub type TxOffset = PinnedVec<u32>;
|
||||
|
||||
type TxOffsets = (TxOffset, TxOffset, TxOffset, TxOffset, Vec<Vec<u32>>);
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
#[repr(C)]
|
||||
@@ -78,6 +85,9 @@ extern "C" {
|
||||
num_elems: usize,
|
||||
use_non_default_stream: u8,
|
||||
) -> c_int;
|
||||
|
||||
pub fn cuda_host_register(ptr: *mut c_void, size: usize, flags: c_uint) -> c_int;
|
||||
pub fn cuda_host_unregister(ptr: *mut c_void) -> c_int;
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "cuda"))]
|
||||
@@ -122,7 +132,11 @@ fn batch_size(batches: &[Packets]) -> usize {
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "cuda"))]
|
||||
pub fn ed25519_verify(batches: &[Packets]) -> Vec<Vec<u8>> {
|
||||
pub fn ed25519_verify(
|
||||
batches: &[Packets],
|
||||
_recycler: &Recycler<TxOffset>,
|
||||
_recycler_out: &Recycler<PinnedVec<u8>>,
|
||||
) -> Vec<Vec<u8>> {
|
||||
ed25519_verify_cpu(batches)
|
||||
}
|
||||
|
||||
@@ -145,11 +159,16 @@ pub fn get_packet_offsets(packet: &Packet, current_offset: u32) -> (u32, u32, u3
|
||||
)
|
||||
}
|
||||
|
||||
pub fn generate_offsets(batches: &[Packets]) -> Result<TxOffsets> {
|
||||
let mut signature_offsets: Vec<_> = Vec::new();
|
||||
let mut pubkey_offsets: Vec<_> = Vec::new();
|
||||
let mut msg_start_offsets: Vec<_> = Vec::new();
|
||||
let mut msg_sizes: Vec<_> = Vec::new();
|
||||
pub fn generate_offsets(batches: &[Packets], recycler: &Recycler<TxOffset>) -> Result<TxOffsets> {
|
||||
debug!("allocating..");
|
||||
let mut signature_offsets: PinnedVec<_> = recycler.allocate("sig_offsets");
|
||||
signature_offsets.set_pinnable();
|
||||
let mut pubkey_offsets: PinnedVec<_> = recycler.allocate("pubkey_offsets");
|
||||
pubkey_offsets.set_pinnable();
|
||||
let mut msg_start_offsets: PinnedVec<_> = recycler.allocate("msg_start_offsets");
|
||||
msg_start_offsets.set_pinnable();
|
||||
let mut msg_sizes: PinnedVec<_> = recycler.allocate("msg_size_offsets");
|
||||
msg_sizes.set_pinnable();
|
||||
let mut current_packet = 0;
|
||||
let mut v_sig_lens = Vec::new();
|
||||
batches.iter().for_each(|p| {
|
||||
@@ -229,7 +248,11 @@ pub fn init() {
|
||||
}
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
pub fn ed25519_verify(batches: &[Packets]) -> Vec<Vec<u8>> {
|
||||
pub fn ed25519_verify(
|
||||
batches: &[Packets],
|
||||
recycler: &Recycler<TxOffset>,
|
||||
recycler_out: &Recycler<PinnedVec<u8>>,
|
||||
) -> Vec<Vec<u8>> {
|
||||
use crate::packet::PACKET_DATA_SIZE;
|
||||
let count = batch_size(batches);
|
||||
|
||||
@@ -243,10 +266,12 @@ pub fn ed25519_verify(batches: &[Packets]) -> Vec<Vec<u8>> {
|
||||
}
|
||||
|
||||
let (signature_offsets, pubkey_offsets, msg_start_offsets, msg_sizes, sig_lens) =
|
||||
generate_offsets(batches).unwrap();
|
||||
generate_offsets(batches, recycler).unwrap();
|
||||
|
||||
debug!("CUDA ECDSA for {}", batch_size(batches));
|
||||
let mut out = Vec::new();
|
||||
debug!("allocating out..");
|
||||
let mut out = recycler_out.allocate("out_buffer");
|
||||
out.set_pinnable();
|
||||
let mut elems = Vec::new();
|
||||
let mut rvs = Vec::new();
|
||||
|
||||
@@ -303,6 +328,11 @@ pub fn ed25519_verify(batches: &[Packets]) -> Vec<Vec<u8>> {
|
||||
}
|
||||
}
|
||||
inc_new_counter_debug!("ed25519_verify_gpu", count);
|
||||
recycler_out.recycle(out);
|
||||
recycler.recycle(signature_offsets);
|
||||
recycler.recycle(pubkey_offsets);
|
||||
recycler.recycle(msg_sizes);
|
||||
recycler.recycle(msg_start_offsets);
|
||||
rvs
|
||||
}
|
||||
|
||||
@@ -320,6 +350,7 @@ pub fn make_packet_from_transaction(tx: Transaction) -> Packet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::packet::{Packet, Packets};
|
||||
use crate::recycler::Recycler;
|
||||
use crate::sigverify;
|
||||
use crate::test_tx::{test_multisig_tx, test_tx};
|
||||
use bincode::{deserialize, serialize};
|
||||
@@ -461,8 +492,10 @@ mod tests {
|
||||
|
||||
let batches = generate_packet_vec(&packet, n, 2);
|
||||
|
||||
let recycler = Recycler::default();
|
||||
let recycler_out = Recycler::default();
|
||||
// verify packets
|
||||
let ans = sigverify::ed25519_verify(&batches);
|
||||
let ans = sigverify::ed25519_verify(&batches, &recycler, &recycler_out);
|
||||
|
||||
// check result
|
||||
let ref_ans = if modify_data { 0u8 } else { 1u8 };
|
||||
@@ -499,8 +532,10 @@ mod tests {
|
||||
|
||||
batches[0].packets.push(packet);
|
||||
|
||||
let recycler = Recycler::default();
|
||||
let recycler_out = Recycler::default();
|
||||
// verify packets
|
||||
let ans = sigverify::ed25519_verify(&batches);
|
||||
let ans = sigverify::ed25519_verify(&batches, &recycler, &recycler_out);
|
||||
|
||||
// check result
|
||||
let ref_ans = 1u8;
|
||||
|
Reference in New Issue
Block a user