bumps up min number of bloom items in gossip pull requests (#17236)

When a validator starts, it has an (almost) empty crds table and it only
sends one pull-request to the entrypoint. The bloom filter in the
pull-request targets a 10% false-positive rate for the given number of
items. So, if `num_items` is a large underestimate, the resulting bloom
filter is very small and has a very high false-positive rate:
https://github.com/solana-labs/solana/blob/2ae57c172/runtime/src/bloom.rs#L70-L80
https://github.com/solana-labs/solana/blob/2ae57c172/core/src/crds_gossip_pull.rs#L48

As a result, it is very unlikely that the validator obtains the
entrypoint's contact-info in the response. This prolongs the time the
validator spends looping on:
    > Waiting to adopt entrypoint shred version
https://github.com/solana-labs/solana/blob/ed51cde37/validator/src/main.rs#L390-L412

This commit increases the minimum number of bloom items used when making
gossip pull requests. Effectively, this breaks the entrypoint's crds
table into 64 shards, with one pull-request and a larger bloom filter
for each shard. This increases the chance that the response includes the
entrypoint's contact-info, which is needed for adopting the shred
version and completing validator start.
This commit is contained in:
behzad nouri
2021-05-21 13:59:26 +00:00
committed by GitHub
parent 662c2aaeec
commit e8b35a4f7b
4 changed files with 89 additions and 101 deletions

View File

@@ -1687,18 +1687,12 @@ impl ClusterInfo {
Ok(())
}
fn process_entrypoints(&self, entrypoints_processed: &mut bool) {
if *entrypoints_processed {
return;
}
fn process_entrypoints(&self) -> bool {
let mut entrypoints = self.entrypoints.write().unwrap();
if entrypoints.is_empty() {
// No entrypoint specified. Nothing more to process
*entrypoints_processed = true;
return;
return true;
}
for entrypoint in entrypoints.iter_mut() {
if entrypoint.id == Pubkey::default() {
// If a pull from the entrypoint was successful it should exist in the CRDS table
@@ -1727,11 +1721,10 @@ impl ClusterInfo {
.set_shred_version(entrypoint.shred_version);
}
}
*entrypoints_processed = self.my_shred_version() != 0
self.my_shred_version() != 0
&& entrypoints
.iter()
.all(|entrypoint| entrypoint.id != Pubkey::default());
.all(|entrypoint| entrypoint.id != Pubkey::default())
}
fn handle_purge(
@@ -1867,8 +1860,7 @@ impl ClusterInfo {
}
self.handle_purge(&thread_pool, &bank_forks, &stakes);
self.process_entrypoints(&mut entrypoints_processed);
entrypoints_processed = entrypoints_processed || self.process_entrypoints();
//TODO: possibly tune this parameter
//we saw a deadlock passing an self.read().unwrap().timeout into sleep
@@ -3851,21 +3843,17 @@ mod tests {
cluster_info.set_entrypoint(entrypoint.clone());
let (pings, pulls) = cluster_info.new_pull_requests(&thread_pool, None, &HashMap::new());
assert!(pings.is_empty());
assert_eq!(1, pulls.len() as u64);
match pulls.get(0) {
Some((addr, msg)) => {
assert_eq!(*addr, entrypoint.gossip);
match msg {
Protocol::PullRequest(_, value) => {
assert!(value.verify());
assert_eq!(value.pubkey(), cluster_info.id())
}
_ => panic!("wrong protocol"),
assert_eq!(pulls.len(), 64);
for (addr, msg) in pulls {
assert_eq!(addr, entrypoint.gossip);
match msg {
Protocol::PullRequest(_, value) => {
assert!(value.verify());
assert_eq!(value.pubkey(), cluster_info.id())
}
_ => panic!("wrong protocol"),
}
None => panic!("entrypoint should be a pull destination"),
}
// now add this message back to the table and make sure after the next pull, the entrypoint is unset
let entrypoint_crdsvalue =
CrdsValue::new_unsigned(CrdsData::ContactInfo(entrypoint.clone()));
@@ -3879,7 +3867,7 @@ mod tests {
);
let (pings, pulls) = cluster_info.new_pull_requests(&thread_pool, None, &HashMap::new());
assert_eq!(pings.len(), 1);
assert_eq!(1, pulls.len() as u64);
assert_eq!(pulls.len(), 64);
assert_eq!(*cluster_info.entrypoints.read().unwrap(), vec![entrypoint]);
}
@@ -4068,24 +4056,30 @@ mod tests {
// fresh timestamp). There should only be one pull request to `other_node`
let (pings, pulls) = cluster_info.new_pull_requests(&thread_pool, None, &stakes);
assert!(pings.is_empty());
assert_eq!(1, pulls.len() as u64);
assert_eq!(pulls.get(0).unwrap().0, other_node.gossip);
assert_eq!(64, pulls.len());
assert!(pulls.into_iter().all(|(addr, _)| addr == other_node.gossip));
// Pull request 2: pretend it's been a while since we've pulled from `entrypoint`. There should
// now be two pull requests
cluster_info.entrypoints.write().unwrap()[0].wallclock = 0;
let (pings, pulls) = cluster_info.new_pull_requests(&thread_pool, None, &stakes);
assert!(pings.is_empty());
assert_eq!(2, pulls.len() as u64);
assert_eq!(pulls.get(0).unwrap().0, other_node.gossip);
assert_eq!(pulls.get(1).unwrap().0, entrypoint.gossip);
assert_eq!(pulls.len(), 64 * 2);
assert!(pulls
.iter()
.take(64)
.all(|(addr, _)| *addr == other_node.gossip));
assert!(pulls
.iter()
.skip(64)
.all(|(addr, _)| *addr == entrypoint.gossip));
// Pull request 3: `other_node` is present and `entrypoint` was just pulled from. There should
// only be one pull request to `other_node`
let (pings, pulls) = cluster_info.new_pull_requests(&thread_pool, None, &stakes);
assert!(pings.is_empty());
assert_eq!(1, pulls.len() as u64);
assert_eq!(pulls.get(0).unwrap().0, other_node.gossip);
assert_eq!(pulls.len(), 64);
assert!(pulls.into_iter().all(|(addr, _)| addr == other_node.gossip));
}
#[test]
@@ -4249,8 +4243,7 @@ mod tests {
.any(|entrypoint| *entrypoint == gossiped_entrypoint1_info));
// Adopt the entrypoint's gossiped contact info and verify
let mut entrypoints_processed = false;
ClusterInfo::process_entrypoints(&cluster_info, &mut entrypoints_processed);
let entrypoints_processed = ClusterInfo::process_entrypoints(&cluster_info);
assert_eq!(cluster_info.entrypoints.read().unwrap().len(), 2);
assert!(cluster_info
.entrypoints
@@ -4278,8 +4271,7 @@ mod tests {
// Adopt the entrypoint's gossiped contact info and verify
error!("Adopt the entrypoint's gossiped contact info and verify");
let mut entrypoints_processed = false;
ClusterInfo::process_entrypoints(&cluster_info, &mut entrypoints_processed);
let entrypoints_processed = ClusterInfo::process_entrypoints(&cluster_info);
assert_eq!(cluster_info.entrypoints.read().unwrap().len(), 2);
assert!(cluster_info
.entrypoints
@@ -4322,8 +4314,7 @@ mod tests {
cluster_info.insert_info(gossiped_entrypoint_info.clone());
// Adopt the entrypoint's gossiped contact info and verify
let mut entrypoints_processed = false;
ClusterInfo::process_entrypoints(&cluster_info, &mut entrypoints_processed);
let entrypoints_processed = ClusterInfo::process_entrypoints(&cluster_info);
assert_eq!(cluster_info.entrypoints.read().unwrap().len(), 1);
assert_eq!(
cluster_info.entrypoints.read().unwrap()[0],