Changed transition to restart Rpu rather than modify bank to prevent lock contention

Authored by Carl on 2018-09-18 17:35:09 -07:00; committed by Greg Fitzgerald
parent 8c8a4ba705
commit cd2d50e06c
4 changed files with 82 additions and 39 deletions
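
The motivation, in brief: before this change the Rpu shared its RequestProcessor as an Arc<RwLock<RequestProcessor>>, so that set_new_bank could swap in a processor over the new bank at leader rotation, which made every request batch pay for a read lock. After this change the processor is shared as a plain Arc and never mutated; rotation instead closes the old Rpu and constructs a fresh one over the new bank. Below is a minimal sketch of the two patterns; the Bank, RequestProcessor, RpuLocked, and Rpu types here are simplified stand-ins for illustration, not the actual solana definitions.

use std::sync::{Arc, RwLock};

struct Bank;
struct RequestProcessor {
    bank: Arc<Bank>,
}

// Old shape: the processor sits behind a lock so the bank can be
// swapped in place, and every request worker takes a read lock.
struct RpuLocked {
    request_processor: Arc<RwLock<RequestProcessor>>,
}

impl RpuLocked {
    fn set_new_bank(&self, new_bank: Arc<Bank>) {
        // The writer blocks all request workers while swapping.
        *self.request_processor.write().unwrap() = RequestProcessor { bank: new_bank };
    }
}

// New shape: the processor is immutable once built; rotation drops
// the whole Rpu and builds a new one, so workers dereference a plain
// Arc with no locking at all.
struct Rpu {
    request_processor: Arc<RequestProcessor>,
}

impl Rpu {
    fn new(bank: &Arc<Bank>) -> Self {
        Rpu {
            request_processor: Arc::new(RequestProcessor { bank: bank.clone() }),
        }
    }
}

fn main() {
    let old = RpuLocked {
        request_processor: Arc::new(RwLock::new(RequestProcessor { bank: Arc::new(Bank) })),
    };
    old.set_new_bank(Arc::new(Bank)); // in-place swap: lock contention
    let _new = Rpu::new(&Arc::new(Bank)); // rebuild: no lock
}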

View File

@@ -76,7 +76,7 @@ pub struct Fullnode {
     pub node_role: Option<NodeRole>,
     keypair: Arc<Keypair>,
     exit: Arc<AtomicBool>,
-    rpu: Rpu,
+    rpu: Option<Rpu>,
     rpc_service: JsonRpcService,
     ncp: Ncp,
     bank: Arc<Bank>,
@@ -89,6 +89,8 @@ pub struct Fullnode {
     retransmit_socket: UdpSocket,
     transaction_sockets: Vec<UdpSocket>,
     broadcast_socket: UdpSocket,
+    requests_socket: UdpSocket,
+    respond_socket: UdpSocket,
     blob_recycler: BlobRecycler,
 }
@@ -236,13 +238,18 @@ impl Fullnode {
         let mut blob_recycler = BlobRecycler::default();
         blob_recycler.set_name("fullnode::Blob");
-        let rpu = Rpu::new(
+        let rpu = Some(Rpu::new(
             &bank,
-            node.sockets.requests,
-            node.sockets.respond,
+            node.sockets
+                .requests
+                .try_clone()
+                .expect("Failed to clone requests socket"),
+            node.sockets
+                .respond
+                .try_clone()
+                .expect("Failed to clone respond socket"),
             &blob_recycler,
-            exit.clone(),
-        );
+        ));

         // TODO: this code assumes this node is the leader
         let mut drone_addr = node.info.contact_info.tpu;
@@ -364,10 +371,12 @@ impl Fullnode {
             retransmit_socket: node.sockets.retransmit,
             transaction_sockets: node.sockets.transaction,
             broadcast_socket: node.sockets.broadcast,
+            requests_socket: node.sockets.requests,
+            respond_socket: node.sockets.respond,
         }
     }

-    fn leader_to_validator(&mut self) {
+    fn leader_to_validator(&mut self) -> Result<()> {
         // TODO: We can avoid building the bank again once RecordStage is
         // integrated with BankingStage
         let (bank, entry_height, _) = Self::new_bank_from_ledger(&self.ledger_path);
@@ -384,9 +393,23 @@ impl Fullnode {
             }
         }

-        // Tell the RPU to serve requests out of the new bank we've created
+        // Make a new RPU to serve requests out of the new bank we've created
         // instead of the old one
-        self.rpu.set_new_bank(self.bank.clone());
+        if !self.rpu.is_none() {
+            let old_rpu = self.rpu.take().unwrap();
+            old_rpu.close()?;
+            self.rpu = Some(Rpu::new(
+                &self.bank,
+                self.requests_socket
+                    .try_clone()
+                    .expect("Failed to clone requests socket"),
+                self.respond_socket
+                    .try_clone()
+                    .expect("Failed to clone respond socket"),
+                &self.blob_recycler,
+            ));
+        }

         let tvu = Tvu::new(
             self.keypair.clone(),
             &self.bank,
@@ -409,6 +432,7 @@ impl Fullnode {
         );
         let validator_state = ValidatorServices::new(tvu);
         self.node_role = Some(NodeRole::Validator(validator_state));
+        Ok(())
     }

     pub fn handle_role_transition(&mut self) -> Result<Option<FullnodeReturnType>> {
@@ -416,7 +440,7 @@ impl Fullnode {
         match node_role {
             Some(NodeRole::Leader(leader_services)) => match leader_services.join()? {
                 Some(TpuReturnType::LeaderRotation) => {
-                    self.leader_to_validator();
+                    self.leader_to_validator()?;
                     Ok(Some(FullnodeReturnType::LeaderRotation))
                 }
                 _ => Ok(None),
@@ -431,7 +455,9 @@ impl Fullnode {
     //used for notifying many nodes in parallel to exit
     pub fn exit(&self) {
        self.exit.store(true, Ordering::Relaxed);
+        if let Some(ref rpu) = self.rpu {
+            rpu.exit();
+        }
         match self.node_role {
             Some(NodeRole::Leader(ref leader_services)) => leader_services.exit(),
             Some(NodeRole::Validator(ref validator_services)) => validator_services.exit(),
@@ -471,7 +497,9 @@ impl Service for Fullnode {
     type JoinReturnType = Option<FullnodeReturnType>;

     fn join(self) -> Result<Option<FullnodeReturnType>> {
-        self.rpu.join()?;
+        if let Some(rpu) = self.rpu {
+            rpu.join()?;
+        }
         self.ncp.join()?;
         self.rpc_service.join()?;
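
One consequence visible in this file: Fullnode now retains the raw requests_socket and respond_socket and hands each Rpu incarnation a try_clone() of them, because the previous incarnation's handles are consumed when it is closed. A small illustration of the std behavior this relies on follows; the variable names are illustrative, not from the codebase. try_clone yields an independent handle to the same bound socket, so the port survives the old handle being dropped.

use std::net::UdpSocket;

fn main() -> std::io::Result<()> {
    // Bind once; the Fullnode-equivalent keeps this original handle.
    let requests_socket = UdpSocket::bind("127.0.0.1:0")?;

    // Each Rpu incarnation would receive its own clone; both handles
    // refer to the same underlying OS socket and bound port.
    let first_incarnation = requests_socket.try_clone()?;
    drop(first_incarnation); // old Rpu shuts down and drops its handle

    // The port is still held by the original, so the next incarnation
    // can be given a fresh clone of the very same socket.
    let second_incarnation = requests_socket.try_clone()?;
    assert_eq!(second_incarnation.local_addr()?, requests_socket.local_addr()?);
    Ok(())
}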

View File

@@ -9,7 +9,7 @@ use result::{Error, Result};
 use service::Service;
 use std::net::SocketAddr;
 use std::sync::mpsc::{channel, Receiver, RecvTimeoutError};
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;
 use std::thread::{self, Builder, JoinHandle};
 use std::time::Instant;
 use streamer::{self, BlobReceiver, BlobSender};
@@ -17,7 +17,7 @@ use timing;

 pub struct RequestStage {
     thread_hdl: JoinHandle<()>,
-    pub request_processor: Arc<RwLock<RequestProcessor>>,
+    pub request_processor: Arc<RequestProcessor>,
 }

 impl RequestStage {
@@ -78,18 +78,19 @@ impl RequestStage {
         Ok(())
     }

     pub fn new(
-        request_processor: Arc<RwLock<RequestProcessor>>,
+        request_processor: RequestProcessor,
         packet_receiver: Receiver<SharedPackets>,
         packet_recycler: PacketRecycler,
         blob_recycler: BlobRecycler,
     ) -> (Self, BlobReceiver) {
+        let request_processor = Arc::new(request_processor);
         let request_processor_ = request_processor.clone();
         let (blob_sender, blob_receiver) = channel();
         let thread_hdl = Builder::new()
             .name("solana-request-stage".to_string())
             .spawn(move || loop {
                 if let Err(e) = Self::process_request_packets(
-                    &request_processor_.read().unwrap(),
+                    &request_processor_,
                     &packet_receiver,
                     &blob_sender,
                     &packet_recycler,
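
The pattern in this file: RequestProcessor is never mutated after construction anymore, so RequestStage can take it by value, wrap it in an Arc itself, and hand the worker thread a clone; the per-batch read().unwrap() disappears. A minimal sketch of that Arc-without-RwLock sharing, where the process method is a stand-in for process_request_packets:

use std::sync::Arc;
use std::thread;

struct RequestProcessor;

impl RequestProcessor {
    fn process(&self, batch: &str) {
        println!("processing {}", batch);
    }
}

fn main() {
    // Taken by value, wrapped once, cloned into the worker thread.
    let request_processor = Arc::new(RequestProcessor);
    let request_processor_ = request_processor.clone();

    let handle = thread::spawn(move || {
        // Before the change this call site would have been
        // request_processor_.read().unwrap().process(...), taking a
        // read lock on every packet batch; a plain Arc deref is
        // lock-free.
        request_processor_.process("packet batch");
    });
    handle.join().unwrap();
}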

View File

@@ -28,18 +28,17 @@ use packet::{BlobRecycler, PacketRecycler};
 use request_processor::RequestProcessor;
 use request_stage::RequestStage;
 use service::Service;
-use std::mem;
 use std::net::UdpSocket;
-use std::sync::atomic::AtomicBool;
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::mpsc::channel;
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;
 use std::thread::{self, JoinHandle};
 use streamer;

 pub struct Rpu {
     request_stage: RequestStage,
     thread_hdls: Vec<JoinHandle<()>>,
-    request_processor: Arc<RwLock<RequestProcessor>>,
+    exit: Arc<AtomicBool>,
 }

 impl Rpu {
@@ -48,21 +47,21 @@ impl Rpu {
         requests_socket: UdpSocket,
         respond_socket: UdpSocket,
         blob_recycler: &BlobRecycler,
-        exit: Arc<AtomicBool>,
     ) -> Self {
+        let exit = Arc::new(AtomicBool::new(false));
         let mut packet_recycler = PacketRecycler::default();
         packet_recycler.set_name("rpu::Packet");
         let (packet_sender, packet_receiver) = channel();
         let t_receiver = streamer::receiver(
             Arc::new(requests_socket),
-            exit,
+            exit.clone(),
             packet_recycler.clone(),
             packet_sender,
         );

-        let request_processor = Arc::new(RwLock::new(RequestProcessor::new(bank.clone())));
+        let request_processor = RequestProcessor::new(bank.clone());
         let (request_stage, blob_receiver) = RequestStage::new(
-            request_processor.clone(),
+            request_processor,
             packet_receiver,
             packet_recycler.clone(),
             blob_recycler.clone(),
@@ -80,13 +79,17 @@ impl Rpu {
         Rpu {
             thread_hdls,
             request_stage,
-            request_processor,
+            exit,
         }
     }

-    pub fn set_new_bank(&self, new_bank: Arc<Bank>) {
-        let mut w_request_procesor = self.request_processor.write().unwrap();
-        mem::replace(&mut *w_request_procesor, RequestProcessor::new(new_bank));
+    pub fn exit(&self) {
+        self.exit.store(true, Ordering::Relaxed);
+    }
+
+    pub fn close(self) -> thread::Result<()> {
+        self.exit();
+        self.join()
     }
 }
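
Design note on this file: the Rpu now owns a private exit flag instead of borrowing the Fullnode's, so exit() stops only this Rpu's threads, and close() blocks until they have actually finished before leader_to_validator builds the replacement. A minimal sketch of the signal-then-join pattern under those assumptions; the worker loop body here is a placeholder, not the real receiver/responder threads.

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread::{self, JoinHandle};
use std::time::Duration;

struct Rpu {
    thread_hdls: Vec<JoinHandle<()>>,
    exit: Arc<AtomicBool>,
}

impl Rpu {
    fn new() -> Self {
        // A private flag per instance: closing one Rpu cannot stop
        // threads belonging to the rest of the node.
        let exit = Arc::new(AtomicBool::new(false));
        let exit_ = exit.clone();
        let thread_hdls = vec![thread::spawn(move || {
            while !exit_.load(Ordering::Relaxed) {
                // placeholder for receiving and answering requests
                thread::sleep(Duration::from_millis(10));
            }
        })];
        Rpu { thread_hdls, exit }
    }

    fn exit(&self) {
        self.exit.store(true, Ordering::Relaxed);
    }

    // Signal, then block until every worker has stopped; the caller
    // can then safely hand cloned sockets to a replacement Rpu.
    fn close(self) -> thread::Result<()> {
        self.exit();
        for thread_hdl in self.thread_hdls {
            thread_hdl.join()?;
        }
        Ok(())
    }
}

fn main() {
    let rpu = Rpu::new();
    rpu.close().unwrap();
}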

View File

@@ -810,18 +810,29 @@ fn test_leader_to_validator_transition() {
     // make sure the leader stops.
     assert!(genesis_height < leader_rotation_interval);
     for i in genesis_height..(leader_rotation_interval + extra_transactions) {
-        let expected_balance = std::cmp::min(
-            leader_rotation_interval - genesis_height,
-            i - genesis_height,
-        );
-
-        send_tx_and_retry_get_balance(
-            &leader_info,
-            &mint,
-            &bob_pubkey,
-            1,
-            Some(expected_balance as i64),
-        );
+        if i < leader_rotation_interval {
+            // Poll to see that the bank state is updated after every transaction
+            // to ensure that each transaction is packaged as a single entry,
+            // so that we can be sure leader rotation is triggered
+            let expected_balance = std::cmp::min(
+                leader_rotation_interval - genesis_height,
+                i - genesis_height + 1,
+            );
+
+            let result = send_tx_and_retry_get_balance(
+                &leader_info,
+                &mint,
+                &bob_pubkey,
+                1,
+                Some(expected_balance as i64),
+            );
+
+            assert_eq!(result, Some(expected_balance as i64))
+        } else {
+            // After leader_rotation_interval entries, we don't care about the
+            // number of entries generated by these transactions. These are just
+            // here for testing to make sure the leader stopped at the correct point.
+            send_tx_and_retry_get_balance(&leader_info, &mint, &bob_pubkey, 1, None);
+        }
     }

     // Wait for leader to shut down tpu and restart tvu