checks for duplicate validator instances using gossip
This commit is contained in:
committed by
Michael Vines
parent
14e241be35
commit
8cd5eb9863
@@ -18,8 +18,8 @@ use crate::{
|
||||
crds_gossip_error::CrdsGossipError,
|
||||
crds_gossip_pull::{CrdsFilter, ProcessPullStats, CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS},
|
||||
crds_value::{
|
||||
self, CrdsData, CrdsValue, CrdsValueLabel, EpochSlotsIndex, LowestSlot, SnapshotHash,
|
||||
Version, Vote, MAX_WALLCLOCK,
|
||||
self, CrdsData, CrdsValue, CrdsValueLabel, EpochSlotsIndex, LowestSlot, NodeInstance,
|
||||
SnapshotHash, Version, Vote, MAX_WALLCLOCK,
|
||||
},
|
||||
data_budget::DataBudget,
|
||||
epoch_slots::EpochSlots,
|
||||
@@ -300,6 +300,7 @@ pub struct ClusterInfo {
|
||||
socket: UdpSocket,
|
||||
local_message_pending_push_queue: RwLock<Vec<(CrdsValue, u64)>>,
|
||||
contact_debug_interval: u64,
|
||||
instance: RwLock<NodeInstance>,
|
||||
}
|
||||
|
||||
impl Default for ClusterInfo {
|
||||
@@ -556,6 +557,7 @@ impl ClusterInfo {
|
||||
socket: UdpSocket::bind("0.0.0.0:0").unwrap(),
|
||||
local_message_pending_push_queue: RwLock::new(vec![]),
|
||||
contact_debug_interval: DEFAULT_CONTACT_DEBUG_INTERVAL,
|
||||
instance: RwLock::new(NodeInstance::new(id, timestamp())),
|
||||
};
|
||||
{
|
||||
let mut gossip = me.gossip.write().unwrap();
|
||||
@@ -590,6 +592,7 @@ impl ClusterInfo {
|
||||
.clone(),
|
||||
),
|
||||
contact_debug_interval: self.contact_debug_interval,
|
||||
instance: RwLock::new(NodeInstance::new(*new_id, timestamp())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -614,16 +617,25 @@ impl ClusterInfo {
|
||||
) {
|
||||
let now = timestamp();
|
||||
self.my_contact_info.write().unwrap().wallclock = now;
|
||||
let entry =
|
||||
CrdsValue::new_signed(CrdsData::ContactInfo(self.my_contact_info()), &self.keypair);
|
||||
self.instance.write().unwrap().update_wallclock(now);
|
||||
let entries: Vec<_> = vec![
|
||||
CrdsData::ContactInfo(self.my_contact_info()),
|
||||
CrdsData::NodeInstance(self.instance.read().unwrap().clone()),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|v| CrdsValue::new_signed(v, &self.keypair))
|
||||
.collect();
|
||||
{
|
||||
let mut local_message_pending_push_queue =
|
||||
self.local_message_pending_push_queue.write().unwrap();
|
||||
for entry in entries {
|
||||
local_message_pending_push_queue.push((entry, now));
|
||||
}
|
||||
}
|
||||
self.gossip
|
||||
.write()
|
||||
.unwrap()
|
||||
.refresh_push_active_set(stakes, gossip_validators);
|
||||
self.local_message_pending_push_queue
|
||||
.write()
|
||||
.unwrap()
|
||||
.push((entry, now));
|
||||
}
|
||||
|
||||
// TODO kill insert_info, only used by tests
|
||||
@@ -2497,8 +2509,8 @@ impl ClusterInfo {
|
||||
stakes: HashMap<Pubkey, u64>,
|
||||
feature_set: Option<&FeatureSet>,
|
||||
epoch_time_ms: u64,
|
||||
) {
|
||||
let mut timer = Measure::start("process_gossip_packets_time");
|
||||
) -> Result<()> {
|
||||
let _st = ScopedTimer::from(&self.stats.process_gossip_packets_time);
|
||||
let packets: Vec<_> = thread_pool.install(|| {
|
||||
packets
|
||||
.into_par_iter()
|
||||
@@ -2511,6 +2523,17 @@ impl ClusterInfo {
|
||||
})
|
||||
.collect()
|
||||
});
|
||||
// Check if there is a duplicate instance of
|
||||
// this node with more recent timestamp.
|
||||
let self_instance = self.instance.read().unwrap().clone();
|
||||
let check_duplicate_instance = |values: &[CrdsValue]| {
|
||||
for value in values {
|
||||
if self_instance.check_duplicate(value) {
|
||||
return Err(Error::DuplicateNodeInstance);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
// Split packets based on their types.
|
||||
let mut pull_requests = vec![];
|
||||
let mut pull_responses = vec![];
|
||||
@@ -2523,8 +2546,14 @@ impl ClusterInfo {
|
||||
Protocol::PullRequest(filter, caller) => {
|
||||
pull_requests.push((from_addr, filter, caller))
|
||||
}
|
||||
Protocol::PullResponse(from, data) => pull_responses.push((from, data)),
|
||||
Protocol::PushMessage(from, data) => push_messages.push((from, data)),
|
||||
Protocol::PullResponse(from, data) => {
|
||||
check_duplicate_instance(&data)?;
|
||||
pull_responses.push((from, data));
|
||||
}
|
||||
Protocol::PushMessage(from, data) => {
|
||||
check_duplicate_instance(&data)?;
|
||||
push_messages.push((from, data));
|
||||
}
|
||||
Protocol::PruneMessage(from, data) => prune_messages.push((from, data)),
|
||||
Protocol::PingMessage(ping) => ping_messages.push((from_addr, ping)),
|
||||
Protocol::PongMessage(pong) => pong_messages.push((from_addr, pong)),
|
||||
@@ -2549,9 +2578,7 @@ impl ClusterInfo {
|
||||
response_sender,
|
||||
feature_set,
|
||||
);
|
||||
self.stats
|
||||
.process_gossip_packets_time
|
||||
.add_measure(&mut timer);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Process messages from the network
|
||||
@@ -2598,7 +2625,7 @@ impl ClusterInfo {
|
||||
stakes,
|
||||
feature_set.as_deref(),
|
||||
epoch_time_ms,
|
||||
);
|
||||
)?;
|
||||
|
||||
self.print_reset_stats(last_print);
|
||||
|
||||
@@ -2863,25 +2890,34 @@ impl ClusterInfo {
|
||||
.build()
|
||||
.unwrap();
|
||||
let mut last_print = Instant::now();
|
||||
loop {
|
||||
let e = self.run_listen(
|
||||
while !exit.load(Ordering::Relaxed) {
|
||||
if let Err(err) = self.run_listen(
|
||||
&recycler,
|
||||
bank_forks.as_ref(),
|
||||
&requests_receiver,
|
||||
&response_sender,
|
||||
&thread_pool,
|
||||
&mut last_print,
|
||||
);
|
||||
if exit.load(Ordering::Relaxed) {
|
||||
return;
|
||||
}
|
||||
if e.is_err() {
|
||||
let r_gossip = self.gossip.read().unwrap();
|
||||
debug!(
|
||||
"{}: run_listen timeout, table size: {}",
|
||||
self.id(),
|
||||
r_gossip.crds.len()
|
||||
);
|
||||
) {
|
||||
match err {
|
||||
Error::RecvTimeoutError(_) => {
|
||||
let table_size = self.gossip.read().unwrap().crds.len();
|
||||
debug!(
|
||||
"{}: run_listen timeout, table size: {}",
|
||||
self.id(),
|
||||
table_size,
|
||||
);
|
||||
}
|
||||
Error::DuplicateNodeInstance => {
|
||||
error!(
|
||||
"duplicate running instances of the same validator node: {}",
|
||||
self.id()
|
||||
);
|
||||
exit.store(true, Ordering::Relaxed);
|
||||
return;
|
||||
}
|
||||
_ => error!("gossip run_listen failed: {}", err),
|
||||
}
|
||||
}
|
||||
thread_mem_usage::datapoint("solana-listen");
|
||||
}
|
||||
|
Reference in New Issue
Block a user