Skip gossip requests with different shred version and split lock (#10240)

This commit is contained in:
sakridge
2020-05-28 11:38:13 -07:00
committed by GitHub
parent 9227874ada
commit 3f508b37fd
4 changed files with 271 additions and 60 deletions

View File

@ -218,7 +218,10 @@ struct GossipStats {
process_pull_response_count: Counter, process_pull_response_count: Counter,
process_pull_response_len: Counter, process_pull_response_len: Counter,
process_pull_response_timeout: Counter, process_pull_response_timeout: Counter,
process_pull_response_fail: Counter,
process_pull_response_success: Counter,
process_pull_requests: Counter, process_pull_requests: Counter,
generate_pull_responses: Counter,
process_prune: Counter, process_prune: Counter,
process_push_message: Counter, process_push_message: Counter,
prune_received_cache: Counter, prune_received_cache: Counter,
@ -228,6 +231,11 @@ struct GossipStats {
push_message: Counter, push_message: Counter,
new_pull_requests: Counter, new_pull_requests: Counter,
mark_pull_request: Counter, mark_pull_request: Counter,
skip_pull_response_shred_version: Counter,
skip_pull_shred_version: Counter,
skip_push_message_shred_version: Counter,
push_message_count: Counter,
push_message_value_count: Counter,
} }
pub struct ClusterInfo { pub struct ClusterInfo {
@ -1526,12 +1534,17 @@ impl ClusterInfo {
if contact_info.id == me.id() { if contact_info.id == me.id() {
warn!("PullRequest ignored, I'm talking to myself"); warn!("PullRequest ignored, I'm talking to myself");
inc_new_counter_debug!("cluster_info-window-request-loopback", 1); inc_new_counter_debug!("cluster_info-window-request-loopback", 1);
} else { } else if contact_info.shred_version == 0
|| contact_info.shred_version == me.my_shred_version()
|| me.my_shred_version() == 0
{
gossip_pull_data.push(PullData { gossip_pull_data.push(PullData {
from_addr, from_addr,
caller, caller,
filter, filter,
}); });
} else {
me.stats.skip_pull_shred_version.add_relaxed(1);
} }
} }
datapoint_debug!( datapoint_debug!(
@ -1620,6 +1633,26 @@ impl ClusterInfo {
} }
} }
fn update_data_budget(&self, stakes: &HashMap<Pubkey, u64>) {
let mut w_outbound_budget = self.outbound_budget.write().unwrap();
let now = timestamp();
const INTERVAL_MS: u64 = 100;
// allow 50kBps per staked validator, epoch slots + votes ~= 1.5kB/slot ~= 4kB/s
const BYTES_PER_INTERVAL: usize = 5000;
const MAX_BUDGET_MULTIPLE: usize = 5; // allow budget build-up to 5x the interval default
if now - w_outbound_budget.last_timestamp_ms > INTERVAL_MS {
let len = std::cmp::max(stakes.len(), 2);
w_outbound_budget.bytes += len * BYTES_PER_INTERVAL;
w_outbound_budget.bytes = std::cmp::min(
w_outbound_budget.bytes,
MAX_BUDGET_MULTIPLE * len * BYTES_PER_INTERVAL,
);
w_outbound_budget.last_timestamp_ms = now;
}
}
// Pull requests take an incoming bloom filter of contained entries from a node // Pull requests take an incoming bloom filter of contained entries from a node
// and tries to send back to them the values it detects are missing. // and tries to send back to them the values it detects are missing.
fn handle_pull_requests( fn handle_pull_requests(
@ -1632,33 +1665,19 @@ impl ClusterInfo {
let mut caller_and_filters = vec![]; let mut caller_and_filters = vec![];
let mut addrs = vec![]; let mut addrs = vec![];
let mut time = Measure::start("handle_pull_requests"); let mut time = Measure::start("handle_pull_requests");
{ me.update_data_budget(stakes);
let mut w_outbound_budget = me.outbound_budget.write().unwrap();
let now = timestamp();
const INTERVAL_MS: u64 = 100;
// allow 50kBps per staked validator, epoch slots + votes ~= 1.5kB/slot ~= 4kB/s
const BYTES_PER_INTERVAL: usize = 5000;
const MAX_BUDGET_MULTIPLE: usize = 5; // allow budget build-up to 5x the interval default
if now - w_outbound_budget.last_timestamp_ms > INTERVAL_MS {
let len = std::cmp::max(stakes.len(), 2);
w_outbound_budget.bytes += len * BYTES_PER_INTERVAL;
w_outbound_budget.bytes = std::cmp::min(
w_outbound_budget.bytes,
MAX_BUDGET_MULTIPLE * len * BYTES_PER_INTERVAL,
);
w_outbound_budget.last_timestamp_ms = now;
}
}
for pull_data in requests { for pull_data in requests {
caller_and_filters.push((pull_data.caller, pull_data.filter)); caller_and_filters.push((pull_data.caller, pull_data.filter));
addrs.push(pull_data.from_addr); addrs.push(pull_data.from_addr);
} }
let now = timestamp(); let now = timestamp();
let self_id = me.id(); let self_id = me.id();
let pull_responses = me let pull_responses = me
.time_gossip_write_lock("process_pull_reqs", &me.stats.process_pull_requests) .time_gossip_read_lock("generate_pull_responses", &me.stats.generate_pull_responses)
.generate_pull_responses(&caller_and_filters);
me.time_gossip_write_lock("process_pull_reqs", &me.stats.process_pull_requests)
.process_pull_requests(caller_and_filters, now); .process_pull_requests(caller_and_filters, now);
// Filter bad to addresses // Filter bad to addresses
@ -1755,37 +1774,94 @@ impl ClusterInfo {
Some(packets) Some(packets)
} }
// Returns (failed, timeout, success)
fn handle_pull_response( fn handle_pull_response(
me: &Self, me: &Self,
from: &Pubkey, from: &Pubkey,
data: Vec<CrdsValue>, mut crds_values: Vec<CrdsValue>,
timeouts: &HashMap<Pubkey, u64>, timeouts: &HashMap<Pubkey, u64>,
) { ) -> (usize, usize, usize) {
let len = data.len(); let len = crds_values.len();
trace!("PullResponse me: {} from: {} len={}", me.id, from, len); trace!("PullResponse me: {} from: {} len={}", me.id, from, len);
let (_fail, timeout_count) = me
if let Some(shred_version) = me.lookup_contact_info(from, |ci| ci.shred_version) {
Self::filter_by_shred_version(
from,
&mut crds_values,
shred_version,
me.my_shred_version(),
);
}
let filtered_len = crds_values.len();
let (fail, timeout_count, success) = me
.time_gossip_write_lock("process_pull", &me.stats.process_pull_response) .time_gossip_write_lock("process_pull", &me.stats.process_pull_response)
.process_pull_response(from, timeouts, data, timestamp()); .process_pull_response(from, timeouts, crds_values, timestamp());
me.stats
.skip_pull_response_shred_version
.add_relaxed((len - filtered_len) as u64);
me.stats.process_pull_response_count.add_relaxed(1); me.stats.process_pull_response_count.add_relaxed(1);
me.stats.process_pull_response_len.add_relaxed(len as u64); me.stats
.process_pull_response_len
.add_relaxed(filtered_len as u64);
me.stats me.stats
.process_pull_response_timeout .process_pull_response_timeout
.add_relaxed(timeout_count as u64); .add_relaxed(timeout_count as u64);
me.stats.process_pull_response_fail.add_relaxed(fail as u64);
me.stats
.process_pull_response_success
.add_relaxed(success as u64);
(fail, timeout_count, success)
}
fn filter_by_shred_version(
from: &Pubkey,
crds_values: &mut Vec<CrdsValue>,
shred_version: u16,
my_shred_version: u16,
) {
if my_shred_version != 0 && shred_version != 0 && shred_version != my_shred_version {
// Allow someone to update their own ContactInfo so they
// can change shred versions if needed.
crds_values.retain(|crds_value| match &crds_value.data {
CrdsData::ContactInfo(contact_info) => contact_info.id == *from,
_ => false,
});
}
} }
fn handle_push_message( fn handle_push_message(
me: &Self, me: &Self,
recycler: &PacketsRecycler, recycler: &PacketsRecycler,
from: &Pubkey, from: &Pubkey,
data: Vec<CrdsValue>, mut crds_values: Vec<CrdsValue>,
stakes: &HashMap<Pubkey, u64>, stakes: &HashMap<Pubkey, u64>,
) -> Option<Packets> { ) -> Option<Packets> {
let self_id = me.id(); let self_id = me.id();
inc_new_counter_debug!("cluster_info-push_message", 1); me.stats.push_message_count.add_relaxed(1);
let len = crds_values.len();
if let Some(shred_version) = me.lookup_contact_info(from, |ci| ci.shred_version) {
Self::filter_by_shred_version(
from,
&mut crds_values,
shred_version,
me.my_shred_version(),
);
}
let filtered_len = crds_values.len();
me.stats
.push_message_value_count
.add_relaxed(filtered_len as u64);
me.stats
.skip_push_message_shred_version
.add_relaxed((len - filtered_len) as u64);
let updated: Vec<_> = me let updated: Vec<_> = me
.time_gossip_write_lock("process_push", &me.stats.process_push_message) .time_gossip_write_lock("process_push", &me.stats.process_push_message)
.process_push_message(from, data, timestamp()); .process_push_message(from, crds_values, timestamp());
let updated_labels: Vec<_> = updated.into_iter().map(|u| u.value.label()).collect(); let updated_labels: Vec<_> = updated.into_iter().map(|u| u.value.label()).collect();
let prunes_map: HashMap<Pubkey, HashSet<Pubkey>> = me let prunes_map: HashMap<Pubkey, HashSet<Pubkey>> = me
@ -1945,6 +2021,11 @@ impl ClusterInfo {
self.stats.process_pull_requests.clear(), self.stats.process_pull_requests.clear(),
i64 i64
), ),
(
"generate_pull_responses",
self.stats.generate_pull_responses.clear(),
i64
),
("process_prune", self.stats.process_prune.clear(), i64), ("process_prune", self.stats.process_prune.clear(), i64),
( (
"process_push_message", "process_push_message",
@ -1974,6 +2055,34 @@ impl ClusterInfo {
i64 i64
), ),
); );
datapoint_info!(
"cluster_info_shred_version_skip",
(
"skip_push_message_shred_version",
self.stats.skip_push_message_shred_version.clear(),
i64
),
(
"skip_pull_response_shred_version",
self.stats.skip_pull_response_shred_version.clear(),
i64
),
(
"skip_pull_shred_version",
self.stats.skip_pull_shred_version.clear(),
i64
),
(
"push_message_count",
self.stats.push_message_count.clear(),
i64
),
(
"push_message_value_count",
self.stats.push_message_value_count.clear(),
i64
),
);
*last_print = Instant::now(); *last_print = Instant::now();
} }
@ -2289,6 +2398,91 @@ mod tests {
assert!(ClusterInfo::is_spy_node(&node)); assert!(ClusterInfo::is_spy_node(&node));
} }
#[test]
fn test_handle_pull() {
let node = Node::new_localhost();
let cluster_info = Arc::new(ClusterInfo::new_with_invalid_keypair(node.info));
let entrypoint_pubkey = Pubkey::new_rand();
let data = test_crds_values(entrypoint_pubkey);
let timeouts = HashMap::new();
assert_eq!(
(0, 0, 1),
ClusterInfo::handle_pull_response(
&cluster_info,
&entrypoint_pubkey,
data.clone(),
&timeouts
)
);
let entrypoint_pubkey2 = Pubkey::new_rand();
assert_eq!(
(1, 0, 0),
ClusterInfo::handle_pull_response(&cluster_info, &entrypoint_pubkey2, data, &timeouts)
);
}
fn test_crds_values(pubkey: Pubkey) -> Vec<CrdsValue> {
let entrypoint = ContactInfo::new_localhost(&pubkey, timestamp());
let entrypoint_crdsvalue =
CrdsValue::new_unsigned(CrdsData::ContactInfo(entrypoint.clone()));
vec![entrypoint_crdsvalue]
}
#[test]
fn test_filter_shred_version() {
let from = Pubkey::new_rand();
let my_shred_version = 1;
let other_shred_version = 1;
// Allow same shred_version
let mut values = test_crds_values(from);
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
// Allow shred_version=0.
let other_shred_version = 0;
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
// Change to sender's ContactInfo version, allow that.
let other_shred_version = 2;
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
let snapshot_hash_data = CrdsValue::new_unsigned(CrdsData::SnapshotHashes(SnapshotHash {
from: Pubkey::new_rand(),
hashes: vec![],
wallclock: 0,
}));
values.push(snapshot_hash_data);
// Change to sender's ContactInfo version, allow that.
let other_shred_version = 2;
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
}
#[test] #[test]
fn test_cluster_spy_gossip() { fn test_cluster_spy_gossip() {
//check that gossip doesn't try to push to invalid addresses //check that gossip doesn't try to push to invalid addresses

View File

@ -158,14 +158,18 @@ impl CrdsGossip {
self.pull.mark_pull_request_creation_time(from, now) self.pull.mark_pull_request_creation_time(from, now)
} }
/// process a pull request and create a response /// process a pull request and create a response
pub fn process_pull_requests( pub fn process_pull_requests(&mut self, filters: Vec<(CrdsValue, CrdsFilter)>, now: u64) {
&mut self,
filters: Vec<(CrdsValue, CrdsFilter)>,
now: u64,
) -> Vec<Vec<CrdsValue>> {
self.pull self.pull
.process_pull_requests(&mut self.crds, filters, now) .process_pull_requests(&mut self.crds, filters, now);
} }
pub fn generate_pull_responses(
&self,
filters: &[(CrdsValue, CrdsFilter)],
) -> Vec<Vec<CrdsValue>> {
self.pull.generate_pull_responses(&self.crds, filters)
}
/// process a pull response /// process a pull response
pub fn process_pull_response( pub fn process_pull_response(
&mut self, &mut self,
@ -173,7 +177,7 @@ impl CrdsGossip {
timeouts: &HashMap<Pubkey, u64>, timeouts: &HashMap<Pubkey, u64>,
response: Vec<CrdsValue>, response: Vec<CrdsValue>,
now: u64, now: u64,
) -> (usize, usize) { ) -> (usize, usize, usize) {
self.pull self.pull
.process_pull_response(&mut self.crds, from, timeouts, response, now) .process_pull_response(&mut self.crds, from, timeouts, response, now)
} }

View File

@ -204,14 +204,13 @@ impl CrdsGossipPull {
self.purged_values.push_back((hash, timestamp)) self.purged_values.push_back((hash, timestamp))
} }
/// process a pull request and create a response /// process a pull request
pub fn process_pull_requests( pub fn process_pull_requests(
&mut self, &mut self,
crds: &mut Crds, crds: &mut Crds,
requests: Vec<(CrdsValue, CrdsFilter)>, requests: Vec<(CrdsValue, CrdsFilter)>,
now: u64, now: u64,
) -> Vec<Vec<CrdsValue>> { ) {
let rv = self.filter_crds_values(crds, &requests);
requests.into_iter().for_each(|(caller, _)| { requests.into_iter().for_each(|(caller, _)| {
let key = caller.label().pubkey(); let key = caller.label().pubkey();
let old = crds.insert(caller, now); let old = crds.insert(caller, now);
@ -221,8 +220,17 @@ impl CrdsGossipPull {
} }
crds.update_record_timestamp(&key, now); crds.update_record_timestamp(&key, now);
}); });
rv
} }
/// Create gossip responses to pull requests
pub fn generate_pull_responses(
&self,
crds: &Crds,
requests: &[(CrdsValue, CrdsFilter)],
) -> Vec<Vec<CrdsValue>> {
self.filter_crds_values(crds, requests)
}
/// process a pull response /// process a pull response
pub fn process_pull_response( pub fn process_pull_response(
&mut self, &mut self,
@ -231,9 +239,10 @@ impl CrdsGossipPull {
timeouts: &HashMap<Pubkey, u64>, timeouts: &HashMap<Pubkey, u64>,
response: Vec<CrdsValue>, response: Vec<CrdsValue>,
now: u64, now: u64,
) -> (usize, usize) { ) -> (usize, usize, usize) {
let mut failed = 0; let mut failed = 0;
let mut timeout_count = 0; let mut timeout_count = 0;
let mut success = 0;
for r in response { for r in response {
let owner = r.label().pubkey(); let owner = r.label().pubkey();
// Check if the crds value is older than the msg_timeout // Check if the crds value is older than the msg_timeout
@ -274,7 +283,11 @@ impl CrdsGossipPull {
} }
} }
let old = crds.insert(r, now); let old = crds.insert(r, now);
failed += old.is_err() as usize; if old.is_err() {
failed += 1;
} else {
success += 1;
}
old.ok().map(|opt| { old.ok().map(|opt| {
crds.update_record_timestamp(&owner, now); crds.update_record_timestamp(&owner, now);
opt.map(|val| { opt.map(|val| {
@ -284,7 +297,7 @@ impl CrdsGossipPull {
}); });
} }
crds.update_record_timestamp(from, now); crds.update_record_timestamp(from, now);
(failed, timeout_count) (failed, timeout_count, success)
} }
// build a set of filters of the current crds table // build a set of filters of the current crds table
// num_filters - used to increase the likelyhood of a value in crds being added to some filter // num_filters - used to increase the likelyhood of a value in crds being added to some filter
@ -573,8 +586,9 @@ mod test {
let mut dest_crds = Crds::default(); let mut dest_crds = Crds::default();
let mut dest = CrdsGossipPull::default(); let mut dest = CrdsGossipPull::default();
let (_, filters, caller) = req.unwrap(); let (_, filters, caller) = req.unwrap();
let filters = filters.into_iter().map(|f| (caller.clone(), f)).collect(); let filters: Vec<_> = filters.into_iter().map(|f| (caller.clone(), f)).collect();
let rsp = dest.process_pull_requests(&mut dest_crds, filters, 1); let rsp = dest.generate_pull_responses(&dest_crds, &filters);
dest.process_pull_requests(&mut dest_crds, filters, 1);
assert!(rsp.iter().all(|rsp| rsp.is_empty())); assert!(rsp.iter().all(|rsp| rsp.is_empty()));
assert!(dest_crds.lookup(&caller.label()).is_some()); assert!(dest_crds.lookup(&caller.label()).is_some());
assert_eq!( assert_eq!(
@ -643,8 +657,9 @@ mod test {
PACKET_DATA_SIZE, PACKET_DATA_SIZE,
); );
let (_, filters, caller) = req.unwrap(); let (_, filters, caller) = req.unwrap();
let filters = filters.into_iter().map(|f| (caller.clone(), f)).collect(); let filters: Vec<_> = filters.into_iter().map(|f| (caller.clone(), f)).collect();
let mut rsp = dest.process_pull_requests(&mut dest_crds, filters, 0); let mut rsp = dest.generate_pull_responses(&dest_crds, &filters);
dest.process_pull_requests(&mut dest_crds, filters, 0);
// if there is a false positive this is empty // if there is a false positive this is empty
// prob should be around 0.1 per iteration // prob should be around 0.1 per iteration
if rsp.is_empty() { if rsp.is_empty() {

View File

@ -426,23 +426,21 @@ fn network_run_pull(
.map(|f| f.filter.bits.len() as usize / 8) .map(|f| f.filter.bits.len() as usize / 8)
.sum::<usize>(); .sum::<usize>();
bytes += serialized_size(&caller_info).unwrap() as usize; bytes += serialized_size(&caller_info).unwrap() as usize;
let filters = filters let filters: Vec<_> = filters
.into_iter() .into_iter()
.map(|f| (caller_info.clone(), f)) .map(|f| (caller_info.clone(), f))
.collect(); .collect();
let rsp = network let rsp: Vec<_> = network
.get(&to) .get(&to)
.map(|node| { .map(|node| {
let mut rsp = vec![]; let rsp = node
rsp.append( .lock()
&mut node .unwrap()
.lock() .generate_pull_responses(&filters)
.unwrap() .into_iter()
.process_pull_requests(filters, now) .flatten()
.into_iter() .collect();
.flatten() node.lock().unwrap().process_pull_requests(filters, now);
.collect(),
);
rsp rsp
}) })
.unwrap(); .unwrap();