Correctly remove replicator from data plane after it's done repairing (#4301)

* Correctly remove replicator from data plane after it's done repairing

* Update discover to report nodes and replicators separately

* Fix print output and the spy-node condition
This commit is contained in:
Sagar Dhawan
2019-05-16 07:14:58 -07:00
committed by GitHub
parent d40b66ff7b
commit a0ffbf50a5
12 changed files with 119 additions and 57 deletions

View File

@ -242,6 +242,7 @@ impl ClusterInfo {
pub fn contact_info_trace(&self) -> String {
let now = timestamp();
let mut spy_nodes = 0;
let mut replicators = 0;
let my_id = self.my_data().id;
let nodes: Vec<_> = self
.all_peers()
@ -249,6 +250,8 @@ impl ClusterInfo {
.map(|(node, last_updated)| {
if Self::is_spy_node(&node) {
spy_nodes += 1;
} else if Self::is_replicator(&node) {
replicators += 1;
}
fn addr_to_string(addr: &SocketAddr) -> String {
if ContactInfo::is_valid_address(addr) {
@ -276,9 +279,14 @@ impl ClusterInfo {
" Node contact info | Age | Node identifier \n\
-------------------------------+---------+-----------------------------------\n\
{}\
Nodes: {}{}",
Nodes: {}{}{}",
nodes.join(""),
nodes.len() - spy_nodes,
nodes.len() - spy_nodes - replicators,
if replicators > 0 {
format!("\nReplicators: {}", replicators)
} else {
"".to_string()
},
if spy_nodes > 0 {
format!("\nSpies: {}", spy_nodes)
} else {
@ -378,7 +386,7 @@ impl ClusterInfo {
.collect()
}
/// compute broadcast table
/// all peers that have a valid tvu port.
pub fn tvu_peers(&self) -> Vec<ContactInfo> {
let me = self.my_data().id;
self.gossip
@ -392,6 +400,20 @@ impl ClusterInfo {
.collect()
}
/// All gossip peers (excluding ourselves) that advertise a valid storage
/// address — i.e. the cluster's replicator nodes.
pub fn storage_peers(&self) -> Vec<ContactInfo> {
let me = self.my_data().id;
self.gossip
.crds
.table
.values()
// keep only CRDS entries that carry contact info
.filter_map(|x| x.value.contact_info())
// a valid storage_addr is what identifies a replicator
.filter(|x| ContactInfo::is_valid_address(&x.storage_addr))
// never include our own entry
.filter(|x| x.id != me)
.cloned()
.collect()
}
/// all peers that have a valid tvu
pub fn retransmit_peers(&self) -> Vec<ContactInfo> {
let me = self.my_data().id;
@ -417,9 +439,15 @@ impl ClusterInfo {
}
fn is_spy_node(contact_info: &ContactInfo) -> bool {
!ContactInfo::is_valid_address(&contact_info.tpu)
(!ContactInfo::is_valid_address(&contact_info.tpu)
|| !ContactInfo::is_valid_address(&contact_info.gossip)
|| !ContactInfo::is_valid_address(&contact_info.tvu)
|| !ContactInfo::is_valid_address(&contact_info.tvu))
&& !ContactInfo::is_valid_address(&contact_info.storage_addr)
}
/// True when the node looks like a replicator: it advertises a valid
/// storage address but no valid TPU address.
pub fn is_replicator(contact_info: &ContactInfo) -> bool {
ContactInfo::is_valid_address(&contact_info.storage_addr)
&& !ContactInfo::is_valid_address(&contact_info.tpu)
}
fn sort_by_stake<S: std::hash::BuildHasher>(

View File

@ -6,7 +6,7 @@ use crate::blocktree::Blocktree;
use crate::cluster_info::FULLNODE_PORT_RANGE;
use crate::contact_info::ContactInfo;
use crate::entry::{Entry, EntrySlice};
use crate::gossip_service::discover_nodes;
use crate::gossip_service::discover_cluster;
use crate::locktower::VOTE_THRESHOLD_DEPTH;
use crate::poh_service::PohServiceConfig;
use solana_client::thin_client::create_client;
@ -30,7 +30,7 @@ pub fn spend_and_verify_all_nodes(
funding_keypair: &Keypair,
nodes: usize,
) {
let cluster_nodes = discover_nodes(&entry_point_info.gossip, nodes).unwrap();
let (cluster_nodes, _) = discover_cluster(&entry_point_info.gossip, nodes).unwrap();
assert!(cluster_nodes.len() >= nodes);
for ingress_node in &cluster_nodes {
let random_keypair = Keypair::new();
@ -81,7 +81,7 @@ pub fn send_many_transactions(node: &ContactInfo, funding_keypair: &Keypair, num
}
pub fn fullnode_exit(entry_point_info: &ContactInfo, nodes: usize) {
let cluster_nodes = discover_nodes(&entry_point_info.gossip, nodes).unwrap();
let (cluster_nodes, _) = discover_cluster(&entry_point_info.gossip, nodes).unwrap();
assert!(cluster_nodes.len() >= nodes);
for node in &cluster_nodes {
let client = create_client(node.client_facing_addr(), FULLNODE_PORT_RANGE);
@ -153,7 +153,7 @@ pub fn kill_entry_and_spend_and_verify_rest(
slot_millis: u64,
) {
solana_logger::setup();
let cluster_nodes = discover_nodes(&entry_point_info.gossip, nodes).unwrap();
let (cluster_nodes, _) = discover_cluster(&entry_point_info.gossip, nodes).unwrap();
assert!(cluster_nodes.len() >= nodes);
let client = create_client(entry_point_info.client_facing_addr(), FULLNODE_PORT_RANGE);
let first_two_epoch_slots = MINIMUM_SLOT_LENGTH * 3;

View File

@ -5,7 +5,7 @@ use crate::blocktree::{Blocktree, CompletedSlotsReceiver};
use crate::blocktree_processor::{self, BankForksInfo};
use crate::cluster_info::{ClusterInfo, Node};
use crate::contact_info::ContactInfo;
use crate::gossip_service::{discover_nodes, GossipService};
use crate::gossip_service::{discover_cluster, GossipService};
use crate::leader_schedule_cache::LeaderScheduleCache;
use crate::poh_recorder::PohRecorder;
use crate::poh_service::{PohService, PohServiceConfig};
@ -369,7 +369,7 @@ pub fn new_fullnode_for_tests() -> (Fullnode, ContactInfo, Keypair, String) {
None,
&FullnodeConfig::default(),
);
discover_nodes(&contact_info.gossip, 1).expect("Node startup failed");
discover_cluster(&contact_info.gossip, 1).expect("Node startup failed");
(node, contact_info, mint_keypair, ledger_path)
}

View File

@ -55,10 +55,11 @@ impl GossipService {
}
}
pub fn discover_nodes(
/// Discover Nodes and Replicators in a cluster
pub fn discover_cluster(
entry_point: &SocketAddr,
num_nodes: usize,
) -> std::io::Result<Vec<ContactInfo>> {
) -> std::io::Result<(Vec<ContactInfo>, Vec<ContactInfo>)> {
discover(entry_point, Some(num_nodes), Some(30), None, None)
}
@ -68,7 +69,7 @@ pub fn discover(
timeout: Option<u64>,
find_node: Option<Pubkey>,
gossip_addr: Option<&SocketAddr>,
) -> std::io::Result<Vec<ContactInfo>> {
) -> std::io::Result<(Vec<ContactInfo>, Vec<ContactInfo>)> {
let exit = Arc::new(AtomicBool::new(false));
let (gossip_service, spy_ref) = make_gossip_node(entry_point, &exit, gossip_addr);
@ -76,7 +77,8 @@ pub fn discover(
info!("Gossip entry point: {:?}", entry_point);
info!("Spy node id: {:?}", id);
let (met_criteria, secs, tvu_peers) = spy(spy_ref.clone(), num_nodes, timeout, find_node);
let (met_criteria, secs, tvu_peers, replicators) =
spy(spy_ref.clone(), num_nodes, timeout, find_node);
exit.store(true, Ordering::Relaxed);
gossip_service.join().unwrap();
@ -87,7 +89,7 @@ pub fn discover(
secs,
spy_ref.read().unwrap().contact_info_trace()
);
return Ok(tvu_peers);
return Ok((tvu_peers, replicators));
}
if !tvu_peers.is_empty() {
@ -95,7 +97,7 @@ pub fn discover(
"discover failed to match criteria by timeout...\n{}",
spy_ref.read().unwrap().contact_info_trace()
);
return Ok(tvu_peers);
return Ok((tvu_peers, replicators));
}
info!(
@ -132,10 +134,11 @@ fn spy(
num_nodes: Option<usize>,
timeout: Option<u64>,
find_node: Option<Pubkey>,
) -> (bool, u64, Vec<ContactInfo>) {
) -> (bool, u64, Vec<ContactInfo>, Vec<ContactInfo>) {
let now = Instant::now();
let mut met_criteria = false;
let mut tvu_peers: Vec<ContactInfo> = Vec::new();
let mut replicators: Vec<ContactInfo> = Vec::new();
let mut i = 0;
loop {
if let Some(secs) = timeout {
@ -143,11 +146,24 @@ fn spy(
break;
}
}
tvu_peers = spy_ref.read().unwrap().tvu_peers();
// collect tvu peers but filter out replicators since their tvu is transient and we do not want
// it to show up as a "node"
tvu_peers = spy_ref
.read()
.unwrap()
.tvu_peers()
.into_iter()
.filter(|node| !ClusterInfo::is_replicator(&node))
.collect::<Vec<_>>();
replicators = spy_ref.read().unwrap().storage_peers();
if let Some(num) = num_nodes {
if tvu_peers.len() >= num {
if tvu_peers.len() + replicators.len() >= num {
if let Some(pubkey) = find_node {
if tvu_peers.iter().any(|x| x.id == pubkey) {
if tvu_peers
.iter()
.chain(replicators.iter())
.any(|x| x.id == pubkey)
{
met_criteria = true;
break;
}
@ -158,7 +174,12 @@ fn spy(
}
}
if let Some(pubkey) = find_node {
if num_nodes.is_none() && tvu_peers.iter().any(|x| x.id == pubkey) {
if num_nodes.is_none()
&& tvu_peers
.iter()
.chain(replicators.iter())
.any(|x| x.id == pubkey)
{
met_criteria = true;
break;
}
@ -174,7 +195,12 @@ fn spy(
));
i += 1;
}
(met_criteria, now.elapsed().as_secs(), tvu_peers)
(
met_criteria,
now.elapsed().as_secs(),
tvu_peers,
replicators,
)
}
/// Makes a spy or gossip node based on whether or not a gossip_addr was passed in
@ -243,29 +269,30 @@ mod tests {
let spy_ref = Arc::new(RwLock::new(cluster_info));
let (met_criteria, secs, tvu_peers) = spy(spy_ref.clone(), None, Some(1), None);
let (met_criteria, secs, tvu_peers, _) = spy(spy_ref.clone(), None, Some(1), None);
assert_eq!(met_criteria, false);
assert_eq!(secs, 1);
assert_eq!(tvu_peers, spy_ref.read().unwrap().tvu_peers());
// Find num_nodes
let (met_criteria, _, _) = spy(spy_ref.clone(), Some(1), None, None);
let (met_criteria, _, _, _) = spy(spy_ref.clone(), Some(1), None, None);
assert_eq!(met_criteria, true);
let (met_criteria, _, _) = spy(spy_ref.clone(), Some(2), None, None);
let (met_criteria, _, _, _) = spy(spy_ref.clone(), Some(2), None, None);
assert_eq!(met_criteria, true);
// Find specific node by pubkey
let (met_criteria, _, _) = spy(spy_ref.clone(), None, None, Some(peer0));
let (met_criteria, _, _, _) = spy(spy_ref.clone(), None, None, Some(peer0));
assert_eq!(met_criteria, true);
let (met_criteria, _, _) = spy(spy_ref.clone(), None, Some(0), Some(Pubkey::new_rand()));
let (met_criteria, _, _, _) = spy(spy_ref.clone(), None, Some(0), Some(Pubkey::new_rand()));
assert_eq!(met_criteria, false);
// Find num_nodes *and* specific node by pubkey
let (met_criteria, _, _) = spy(spy_ref.clone(), Some(1), None, Some(peer0));
let (met_criteria, _, _, _) = spy(spy_ref.clone(), Some(1), None, Some(peer0));
assert_eq!(met_criteria, true);
let (met_criteria, _, _) = spy(spy_ref.clone(), Some(3), Some(0), Some(peer0));
let (met_criteria, _, _, _) = spy(spy_ref.clone(), Some(3), Some(0), Some(peer0));
assert_eq!(met_criteria, false);
let (met_criteria, _, _) = spy(spy_ref.clone(), Some(1), Some(0), Some(Pubkey::new_rand()));
let (met_criteria, _, _, _) =
spy(spy_ref.clone(), Some(1), Some(0), Some(Pubkey::new_rand()));
assert_eq!(met_criteria, false);
}
}

View File

@ -4,7 +4,7 @@ use crate::cluster_info::{Node, FULLNODE_PORT_RANGE};
use crate::contact_info::ContactInfo;
use crate::fullnode::{Fullnode, FullnodeConfig};
use crate::genesis_utils::create_genesis_block_with_leader;
use crate::gossip_service::discover_nodes;
use crate::gossip_service::discover_cluster;
use crate::replicator::Replicator;
use crate::service::Service;
use solana_client::thin_client::create_client;
@ -176,7 +176,7 @@ impl LocalCluster {
};
(0..config.num_listeners).for_each(|_| cluster.add_validator(&listener_config, 0));
discover_nodes(
discover_cluster(
&cluster.entry_point_info.gossip,
config.node_stakes.len() + config.num_listeners as usize,
)
@ -186,7 +186,7 @@ impl LocalCluster {
cluster.add_replicator();
}
discover_nodes(
discover_cluster(
&cluster.entry_point_info.gossip,
config.node_stakes.len() + config.num_replicators as usize,
)

View File

@ -26,6 +26,7 @@ use solana_sdk::hash::{Hash, Hasher};
use solana_sdk::message::Message;
use solana_sdk::signature::{Keypair, KeypairUtil, Signature};
use solana_sdk::system_transaction;
use solana_sdk::timing::timestamp;
use solana_sdk::transaction::Transaction;
use solana_sdk::transport::TransportError;
use solana_storage_api::{get_segment_from_slot, storage_instruction, SLOTS_PER_SEGMENT};
@ -209,7 +210,7 @@ impl Replicator {
);
info!("Connecting to the cluster via {:?}", cluster_entrypoint);
let nodes = crate::gossip_service::discover_nodes(&cluster_entrypoint.gossip, 1)?;
let (nodes, _) = crate::gossip_service::discover_cluster(&cluster_entrypoint.gossip, 1)?;
let client = crate::gossip_service::get_client(&nodes);
let (storage_blockhash, storage_slot) = Self::poll_for_blockhash_and_slot(&cluster_info)?;
@ -349,6 +350,7 @@ impl Replicator {
// Remove replicator from the data plane
let mut contact_info = node_info.clone();
contact_info.tvu = "0.0.0.0:0".parse().unwrap();
contact_info.wallclock = timestamp();
{
let mut cluster_info_w = cluster_info.write().unwrap();
cluster_info_w.insert_self(contact_info);