Fix repair slowness when most peers are unable to serve requests (#7287)

* Fix repair when most peers are incapable of serving requests

* Add a test for getting the lowest slot in blocktree

* Replace some more u64s with Slot
Author: Sagar Dhawan
Committed by: GitHub
Date: 2019-12-05 11:25:13 -08:00
Commit: a95d37ea25 (parent d8e1a196bc)
5 changed files with 204 additions and 60 deletions
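The gist of the fix: when choosing repair peers, skip any peer whose gossiped EpochSlots advertises a lowest stored slot above the slot being repaired, since that peer has already purged the data and cannot serve the request. Below is a minimal standalone sketch of that filter; the names (AdvertisedEpochSlots, Peer, repair_peers) are illustrative placeholders, not the crate's actual API.

// Minimal sketch (not the crate's actual API): a peer can serve a repair for
// `slot` only if its advertised `lowest` stored slot is <= that slot.
use std::collections::BTreeSet;

type Slot = u64;

#[allow(dead_code)]
struct AdvertisedEpochSlots {
    root: Slot,
    lowest: Slot,          // lowest slot the peer still has in its ledger
    slots: BTreeSet<Slot>, // completed slots above the root
}

struct Peer {
    id: String,
    epoch_slots: Option<AdvertisedEpochSlots>, // None if no gossip data seen yet
}

fn repair_peers(peers: &[Peer], slot: Slot) -> Vec<&Peer> {
    peers
        .iter()
        .filter(|peer| {
            peer.epoch_slots
                .as_ref()
                .map(|es| es.lowest <= slot)
                // No epoch-slots info for this peer: keep it as a candidate,
                // mirroring the "fallback to legacy behavior" in the diff.
                .unwrap_or(true)
        })
        .collect()
}

fn main() {
    let peers = vec![
        Peer {
            id: "has_slot".into(),
            epoch_slots: Some(AdvertisedEpochSlots {
                root: 5,
                lowest: 0,
                slots: BTreeSet::new(),
            }),
        },
        Peer {
            id: "purged_slot".into(),
            epoch_slots: Some(AdvertisedEpochSlots {
                root: 15,
                lowest: 10,
                slots: BTreeSet::new(),
            }),
        },
        Peer {
            id: "unknown".into(),
            epoch_slots: None,
        },
    ];
    // Only "has_slot" and "unknown" remain candidates to repair slot 5;
    // "purged_slot" has already dropped everything below slot 10.
    let eligible: Vec<&str> = repair_peers(&peers, 5).iter().map(|p| p.id.as_str()).collect();
    assert_eq!(eligible, vec!["has_slot", "unknown"]);
    println!("eligible repair peers: {:?}", eligible);
}

Peers with no epoch-slots information are kept as candidates, matching the legacy behavior preserved by the diff below.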

@@ -317,10 +317,10 @@ impl ClusterInfo {
         )
     }
-    pub fn push_epoch_slots(&mut self, id: Pubkey, root: u64, slots: BTreeSet<u64>) {
+    pub fn push_epoch_slots(&mut self, id: Pubkey, root: Slot, min: Slot, slots: BTreeSet<Slot>) {
         let now = timestamp();
         let entry = CrdsValue::new_signed(
-            CrdsData::EpochSlots(EpochSlots::new(id, root, slots, now)),
+            CrdsData::EpochSlots(EpochSlots::new(id, root, min, slots, now)),
             &self.keypair,
         );
         self.gossip
@@ -489,13 +489,18 @@ impl ClusterInfo {
             .collect()
     }
-    /// all tvu peers with valid gossip addrs
-    fn repair_peers(&self) -> Vec<ContactInfo> {
+    /// all tvu peers with valid gossip addrs that likely have the slot being requested
+    fn repair_peers(&self, slot: Slot) -> Vec<ContactInfo> {
         let me = self.my_data().id;
         ClusterInfo::tvu_peers(self)
             .into_iter()
             .filter(|x| x.id != me)
             .filter(|x| ContactInfo::is_valid_address(&x.gossip))
+            .filter(|x| {
+                self.get_epoch_state_for_node(&x.id, None)
+                    .map(|(epoch_slots, _)| epoch_slots.lowest <= slot)
+                    .unwrap_or_else(|| /* fallback to legacy behavior */ true)
+            })
             .collect()
     }
@@ -840,9 +845,9 @@ impl ClusterInfo {
     }
     pub fn repair_request(&self, repair_request: &RepairType) -> Result<(SocketAddr, Vec<u8>)> {
-        // find a peer that appears to be accepting replication, as indicated
-        // by a valid tvu port location
-        let valid: Vec<_> = self.repair_peers();
+        // find a peer that appears to be accepting replication and has the desired slot, as indicated
+        // by a valid tvu port location
+        let valid: Vec<_> = self.repair_peers(repair_request.slot());
         if valid.is_empty() {
             return Err(ClusterInfoError::NoPeers.into());
         }
@@ -2555,6 +2560,7 @@ mod tests {
         let value = CrdsValue::new_unsigned(CrdsData::EpochSlots(EpochSlots {
             from: Pubkey::default(),
             root: 0,
+            lowest: 0,
             slots: btree_slots,
             wallclock: 0,
         }));
@@ -2571,6 +2577,7 @@ mod tests {
         let mut value = CrdsValue::new_unsigned(CrdsData::EpochSlots(EpochSlots {
             from: Pubkey::default(),
             root: 0,
+            lowest: 0,
             slots: BTreeSet::new(),
             wallclock: 0,
         }));
@@ -2588,6 +2595,7 @@ mod tests {
         value.data = CrdsData::EpochSlots(EpochSlots {
             from: Pubkey::default(),
             root: 0,
+            lowest: 0,
             slots,
             wallclock: 0,
         });
@@ -2700,6 +2708,37 @@ mod tests {
         assert_eq!(pulls.get(0).unwrap().0, other_node.gossip);
     }
+    #[test]
+    fn test_repair_peers() {
+        let node_keypair = Arc::new(Keypair::new());
+        let mut cluster_info = ClusterInfo::new(
+            ContactInfo::new_localhost(&node_keypair.pubkey(), timestamp()),
+            node_keypair,
+        );
+        for i in 0..10 {
+            let mut peer_root = 5;
+            let mut peer_lowest = 0;
+            if i >= 5 {
+                // make these invalid for the upcoming repair request
+                peer_root = 15;
+                peer_lowest = 10;
+            }
+            let other_node_pubkey = Pubkey::new_rand();
+            let other_node = ContactInfo::new_localhost(&other_node_pubkey, timestamp());
+            cluster_info.insert_info(other_node.clone());
+            let value = CrdsValue::new_unsigned(CrdsData::EpochSlots(EpochSlots::new(
+                other_node_pubkey,
+                peer_root,
+                peer_lowest,
+                BTreeSet::new(),
+                timestamp(),
+            )));
+            let _ = cluster_info.gossip.crds.insert(value, timestamp());
+        }
+        // only half the visible peers should be eligible to serve this repair
+        assert_eq!(cluster_info.repair_peers(5).len(), 5);
+    }
     #[test]
     fn test_max_bloom_size() {
         assert_eq!(MAX_BLOOM_SIZE, max_bloom_size());