Cluster info metrics (#10215)
This commit is contained in:
@ -55,7 +55,7 @@ use solana_sdk::{
|
|||||||
clock::{Slot, DEFAULT_MS_PER_SLOT, DEFAULT_SLOTS_PER_EPOCH},
|
clock::{Slot, DEFAULT_MS_PER_SLOT, DEFAULT_SLOTS_PER_EPOCH},
|
||||||
pubkey::Pubkey,
|
pubkey::Pubkey,
|
||||||
signature::{Keypair, Signable, Signature, Signer},
|
signature::{Keypair, Signable, Signature, Signer},
|
||||||
timing::{duration_as_ms, timestamp},
|
timing::timestamp,
|
||||||
transaction::Transaction,
|
transaction::Transaction,
|
||||||
};
|
};
|
||||||
use solana_streamer::sendmmsg::multicast;
|
use solana_streamer::sendmmsg::multicast;
|
||||||
@ -66,8 +66,9 @@ use std::{
|
|||||||
collections::{HashMap, HashSet},
|
collections::{HashMap, HashSet},
|
||||||
fmt,
|
fmt,
|
||||||
net::{IpAddr, Ipv4Addr, SocketAddr, TcpListener, UdpSocket},
|
net::{IpAddr, Ipv4Addr, SocketAddr, TcpListener, UdpSocket},
|
||||||
sync::atomic::{AtomicBool, Ordering},
|
ops::{Deref, DerefMut},
|
||||||
sync::{Arc, RwLock},
|
sync::atomic::{AtomicBool, AtomicU64, Ordering},
|
||||||
|
sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
|
||||||
thread::{sleep, Builder, JoinHandle},
|
thread::{sleep, Builder, JoinHandle},
|
||||||
time::{Duration, Instant},
|
time::{Duration, Instant},
|
||||||
};
|
};
|
||||||
@ -108,6 +109,127 @@ pub struct DataBudget {
|
|||||||
// used to detect when to up the bytes budget again
|
// used to detect when to up the bytes budget again
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct GossipWriteLock<'a> {
|
||||||
|
gossip: RwLockWriteGuard<'a, CrdsGossip>,
|
||||||
|
timer: Measure,
|
||||||
|
counter: &'a Counter,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> GossipWriteLock<'a> {
|
||||||
|
fn new(
|
||||||
|
gossip: RwLockWriteGuard<'a, CrdsGossip>,
|
||||||
|
label: &'static str,
|
||||||
|
counter: &'a Counter,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
gossip,
|
||||||
|
timer: Measure::start(label),
|
||||||
|
counter,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Deref for GossipWriteLock<'a> {
|
||||||
|
type Target = RwLockWriteGuard<'a, CrdsGossip>;
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.gossip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> DerefMut for GossipWriteLock<'a> {
|
||||||
|
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||||
|
&mut self.gossip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Drop for GossipWriteLock<'a> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.timer.stop();
|
||||||
|
self.counter.add_measure(&mut self.timer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct GossipReadLock<'a> {
|
||||||
|
gossip: RwLockReadGuard<'a, CrdsGossip>,
|
||||||
|
timer: Measure,
|
||||||
|
counter: &'a Counter,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> GossipReadLock<'a> {
|
||||||
|
fn new(
|
||||||
|
gossip: RwLockReadGuard<'a, CrdsGossip>,
|
||||||
|
label: &'static str,
|
||||||
|
counter: &'a Counter,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
gossip,
|
||||||
|
timer: Measure::start(label),
|
||||||
|
counter,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Deref for GossipReadLock<'a> {
|
||||||
|
type Target = RwLockReadGuard<'a, CrdsGossip>;
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.gossip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Drop for GossipReadLock<'a> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.timer.stop();
|
||||||
|
self.counter.add_measure(&mut self.timer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
struct Counter(AtomicU64);
|
||||||
|
|
||||||
|
impl Counter {
|
||||||
|
fn add_measure(&self, x: &mut Measure) {
|
||||||
|
x.stop();
|
||||||
|
self.0.fetch_add(x.as_us(), Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
fn add_relaxed(&self, x: u64) {
|
||||||
|
self.0.fetch_add(x, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
fn clear(&self) -> u64 {
|
||||||
|
self.0.swap(0, Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
struct GossipStats {
|
||||||
|
entrypoint: Counter,
|
||||||
|
entrypoint2: Counter,
|
||||||
|
push_vote_read: Counter,
|
||||||
|
vote_process_push: Counter,
|
||||||
|
get_votes: Counter,
|
||||||
|
get_accounts_hash: Counter,
|
||||||
|
get_snapshot_hash: Counter,
|
||||||
|
all_tvu_peers: Counter,
|
||||||
|
tvu_peers: Counter,
|
||||||
|
retransmit_peers: Counter,
|
||||||
|
repair_peers: Counter,
|
||||||
|
new_push_requests: Counter,
|
||||||
|
new_push_requests2: Counter,
|
||||||
|
process_pull_response: Counter,
|
||||||
|
process_pull_response_count: Counter,
|
||||||
|
process_pull_response_len: Counter,
|
||||||
|
process_pull_response_timeout: Counter,
|
||||||
|
process_pull_requests: Counter,
|
||||||
|
process_prune: Counter,
|
||||||
|
process_push_message: Counter,
|
||||||
|
prune_received_cache: Counter,
|
||||||
|
purge: Counter,
|
||||||
|
epoch_slots_lookup: Counter,
|
||||||
|
epoch_slots_push: Counter,
|
||||||
|
push_message: Counter,
|
||||||
|
new_pull_requests: Counter,
|
||||||
|
mark_pull_request: Counter,
|
||||||
|
}
|
||||||
|
|
||||||
pub struct ClusterInfo {
|
pub struct ClusterInfo {
|
||||||
/// The network
|
/// The network
|
||||||
pub gossip: RwLock<CrdsGossip>,
|
pub gossip: RwLock<CrdsGossip>,
|
||||||
@ -118,6 +240,7 @@ pub struct ClusterInfo {
|
|||||||
outbound_budget: RwLock<DataBudget>,
|
outbound_budget: RwLock<DataBudget>,
|
||||||
my_contact_info: RwLock<ContactInfo>,
|
my_contact_info: RwLock<ContactInfo>,
|
||||||
id: Pubkey,
|
id: Pubkey,
|
||||||
|
stats: GossipStats,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Clone)]
|
#[derive(Default, Clone)]
|
||||||
@ -266,6 +389,7 @@ impl ClusterInfo {
|
|||||||
}),
|
}),
|
||||||
my_contact_info: RwLock::new(contact_info),
|
my_contact_info: RwLock::new(contact_info),
|
||||||
id,
|
id,
|
||||||
|
stats: GossipStats::default(),
|
||||||
};
|
};
|
||||||
{
|
{
|
||||||
let mut gossip = me.gossip.write().unwrap();
|
let mut gossip = me.gossip.write().unwrap();
|
||||||
@ -290,6 +414,7 @@ impl ClusterInfo {
|
|||||||
outbound_budget: RwLock::new(self.outbound_budget.read().unwrap().clone()),
|
outbound_budget: RwLock::new(self.outbound_budget.read().unwrap().clone()),
|
||||||
my_contact_info: RwLock::new(my_contact_info),
|
my_contact_info: RwLock::new(my_contact_info),
|
||||||
id: *new_id,
|
id: *new_id,
|
||||||
|
stats: GossipStats::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -475,9 +600,10 @@ impl ClusterInfo {
|
|||||||
let mut current_slots: Vec<_> = (0..crds_value::MAX_EPOCH_SLOTS)
|
let mut current_slots: Vec<_> = (0..crds_value::MAX_EPOCH_SLOTS)
|
||||||
.filter_map(|ix| {
|
.filter_map(|ix| {
|
||||||
Some((
|
Some((
|
||||||
self.gossip
|
self.time_gossip_read_lock(
|
||||||
.read()
|
"lookup_epoch_slots",
|
||||||
.unwrap()
|
&self.stats.epoch_slots_lookup,
|
||||||
|
)
|
||||||
.crds
|
.crds
|
||||||
.lookup(&CrdsValueLabel::EpochSlots(ix, self.id()))
|
.lookup(&CrdsValueLabel::EpochSlots(ix, self.id()))
|
||||||
.and_then(CrdsValue::epoch_slots)
|
.and_then(CrdsValue::epoch_slots)
|
||||||
@ -518,9 +644,7 @@ impl ClusterInfo {
|
|||||||
let n = slots.fill(&update[num..], now);
|
let n = slots.fill(&update[num..], now);
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
let entry = CrdsValue::new_signed(CrdsData::EpochSlots(ix, slots), &self.keypair);
|
let entry = CrdsValue::new_signed(CrdsData::EpochSlots(ix, slots), &self.keypair);
|
||||||
self.gossip
|
self.time_gossip_write_lock("epcoh_slots_push", &self.stats.epoch_slots_push)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.process_push_message(&self.id(), vec![entry], now);
|
.process_push_message(&self.id(), vec![entry], now);
|
||||||
}
|
}
|
||||||
num += n;
|
num += n;
|
||||||
@ -531,12 +655,26 @@ impl ClusterInfo {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn time_gossip_read_lock<'a>(
|
||||||
|
&'a self,
|
||||||
|
label: &'static str,
|
||||||
|
counter: &'a Counter,
|
||||||
|
) -> GossipReadLock<'a> {
|
||||||
|
GossipReadLock::new(self.gossip.read().unwrap(), label, counter)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn time_gossip_write_lock<'a>(
|
||||||
|
&'a self,
|
||||||
|
label: &'static str,
|
||||||
|
counter: &'a Counter,
|
||||||
|
) -> GossipWriteLock<'a> {
|
||||||
|
GossipWriteLock::new(self.gossip.write().unwrap(), label, counter)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn push_message(&self, message: CrdsValue) {
|
pub fn push_message(&self, message: CrdsValue) {
|
||||||
let now = message.wallclock();
|
let now = message.wallclock();
|
||||||
let id = message.pubkey();
|
let id = message.pubkey();
|
||||||
self.gossip
|
self.time_gossip_write_lock("process_push_message", &self.stats.push_message)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.process_push_message(&id, vec![message], now);
|
.process_push_message(&id, vec![message], now);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -570,16 +708,15 @@ impl ClusterInfo {
|
|||||||
let now = timestamp();
|
let now = timestamp();
|
||||||
let vote = Vote::new(&self.id(), vote, now);
|
let vote = Vote::new(&self.id(), vote, now);
|
||||||
let vote_ix = {
|
let vote_ix = {
|
||||||
let r_gossip = self.gossip.read().unwrap();
|
let r_gossip =
|
||||||
|
self.time_gossip_read_lock("gossip_read_push_vote", &self.stats.push_vote_read);
|
||||||
let current_votes: Vec<_> = (0..crds_value::MAX_VOTES)
|
let current_votes: Vec<_> = (0..crds_value::MAX_VOTES)
|
||||||
.filter_map(|ix| r_gossip.crds.lookup(&CrdsValueLabel::Vote(ix, self.id())))
|
.filter_map(|ix| r_gossip.crds.lookup(&CrdsValueLabel::Vote(ix, self.id())))
|
||||||
.collect();
|
.collect();
|
||||||
CrdsValue::compute_vote_index(tower_index, current_votes)
|
CrdsValue::compute_vote_index(tower_index, current_votes)
|
||||||
};
|
};
|
||||||
let entry = CrdsValue::new_signed(CrdsData::Vote(vote_ix, vote), &self.keypair);
|
let entry = CrdsValue::new_signed(CrdsData::Vote(vote_ix, vote), &self.keypair);
|
||||||
self.gossip
|
self.time_gossip_write_lock("push_vote_process_push", &self.stats.vote_process_push)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.process_push_message(&self.id(), vec![entry], now);
|
.process_push_message(&self.id(), vec![entry], now);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -591,9 +728,7 @@ impl ClusterInfo {
|
|||||||
pub fn get_votes(&self, since: u64) -> (Vec<CrdsValueLabel>, Vec<Transaction>, u64) {
|
pub fn get_votes(&self, since: u64) -> (Vec<CrdsValueLabel>, Vec<Transaction>, u64) {
|
||||||
let mut max_ts = since;
|
let mut max_ts = since;
|
||||||
let (labels, txs): (Vec<CrdsValueLabel>, Vec<Transaction>) = self
|
let (labels, txs): (Vec<CrdsValueLabel>, Vec<Transaction>) = self
|
||||||
.gossip
|
.time_gossip_read_lock("get_votes", &self.stats.get_votes)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.table
|
.table
|
||||||
.iter()
|
.iter()
|
||||||
@ -610,9 +745,7 @@ impl ClusterInfo {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_snapshot_hash(&self, slot: Slot) -> Vec<(Pubkey, Hash)> {
|
pub fn get_snapshot_hash(&self, slot: Slot) -> Vec<(Pubkey, Hash)> {
|
||||||
self.gossip
|
self.time_gossip_read_lock("get_snapshot_hash", &self.stats.get_snapshot_hash)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.table
|
.table
|
||||||
.values()
|
.values()
|
||||||
@ -632,9 +765,7 @@ impl ClusterInfo {
|
|||||||
where
|
where
|
||||||
F: FnOnce(&Vec<(Slot, Hash)>) -> Y,
|
F: FnOnce(&Vec<(Slot, Hash)>) -> Y,
|
||||||
{
|
{
|
||||||
self.gossip
|
self.time_gossip_read_lock("get_accounts_hash", &self.stats.get_accounts_hash)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.table
|
.table
|
||||||
.get(&CrdsValueLabel::AccountsHashes(*pubkey))
|
.get(&CrdsValueLabel::AccountsHashes(*pubkey))
|
||||||
@ -758,9 +889,7 @@ impl ClusterInfo {
|
|||||||
|
|
||||||
/// all validators that have a valid tvu port regardless of `shred_version`.
|
/// all validators that have a valid tvu port regardless of `shred_version`.
|
||||||
pub fn all_tvu_peers(&self) -> Vec<ContactInfo> {
|
pub fn all_tvu_peers(&self) -> Vec<ContactInfo> {
|
||||||
self.gossip
|
self.time_gossip_read_lock("all_tvu_peers", &self.stats.all_tvu_peers)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.table
|
.table
|
||||||
.values()
|
.values()
|
||||||
@ -772,9 +901,7 @@ impl ClusterInfo {
|
|||||||
|
|
||||||
/// all validators that have a valid tvu port and are on the same `shred_version`.
|
/// all validators that have a valid tvu port and are on the same `shred_version`.
|
||||||
pub fn tvu_peers(&self) -> Vec<ContactInfo> {
|
pub fn tvu_peers(&self) -> Vec<ContactInfo> {
|
||||||
self.gossip
|
self.time_gossip_read_lock("tvu_peers", &self.stats.tvu_peers)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.table
|
.table
|
||||||
.values()
|
.values()
|
||||||
@ -790,9 +917,7 @@ impl ClusterInfo {
|
|||||||
|
|
||||||
/// all peers that have a valid tvu
|
/// all peers that have a valid tvu
|
||||||
pub fn retransmit_peers(&self) -> Vec<ContactInfo> {
|
pub fn retransmit_peers(&self) -> Vec<ContactInfo> {
|
||||||
self.gossip
|
self.time_gossip_read_lock("retransmit_peers", &self.stats.retransmit_peers)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.table
|
.table
|
||||||
.values()
|
.values()
|
||||||
@ -809,7 +934,8 @@ impl ClusterInfo {
|
|||||||
|
|
||||||
/// all tvu peers with valid gossip addrs that likely have the slot being requested
|
/// all tvu peers with valid gossip addrs that likely have the slot being requested
|
||||||
pub fn repair_peers(&self, slot: Slot) -> Vec<ContactInfo> {
|
pub fn repair_peers(&self, slot: Slot) -> Vec<ContactInfo> {
|
||||||
ClusterInfo::tvu_peers(self)
|
let mut time = Measure::start("repair_peers");
|
||||||
|
let ret = ClusterInfo::tvu_peers(self)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter(|x| {
|
.filter(|x| {
|
||||||
x.id != self.id()
|
x.id != self.id()
|
||||||
@ -822,7 +948,9 @@ impl ClusterInfo {
|
|||||||
.unwrap_or_else(|| /* fallback to legacy behavior */ true)
|
.unwrap_or_else(|| /* fallback to legacy behavior */ true)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect()
|
.collect();
|
||||||
|
self.stats.repair_peers.add_measure(&mut time);
|
||||||
|
ret
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_spy_node(contact_info: &ContactInfo) -> bool {
|
fn is_spy_node(contact_info: &ContactInfo) -> bool {
|
||||||
@ -1105,8 +1233,12 @@ impl ClusterInfo {
|
|||||||
false
|
false
|
||||||
} else {
|
} else {
|
||||||
entrypoint.wallclock = now;
|
entrypoint.wallclock = now;
|
||||||
let found_entrypoint =
|
let found_entrypoint = self
|
||||||
self.gossip.read().unwrap().crds.table.iter().any(|(_, v)| {
|
.time_gossip_read_lock("entrypoint", &self.stats.entrypoint)
|
||||||
|
.crds
|
||||||
|
.table
|
||||||
|
.iter()
|
||||||
|
.any(|(_, v)| {
|
||||||
v.value
|
v.value
|
||||||
.contact_info()
|
.contact_info()
|
||||||
.map(|ci| ci.gossip == entrypoint.gossip)
|
.map(|ci| ci.gossip == entrypoint.gossip)
|
||||||
@ -1129,12 +1261,12 @@ impl ClusterInfo {
|
|||||||
.map(|e| (e.id, e.gossip))
|
.map(|e| (e.id, e.gossip))
|
||||||
};
|
};
|
||||||
if let Some((id, gossip)) = id_and_gossip {
|
if let Some((id, gossip)) = id_and_gossip {
|
||||||
let r_gossip = self.gossip.read().unwrap();
|
let r_gossip = self.time_gossip_read_lock("entrypoint", &self.stats.entrypoint2);
|
||||||
let self_info = r_gossip
|
let self_info = r_gossip
|
||||||
.crds
|
.crds
|
||||||
.lookup(&CrdsValueLabel::ContactInfo(self.id()))
|
.lookup(&CrdsValueLabel::ContactInfo(self.id()))
|
||||||
.unwrap_or_else(|| panic!("self_id invalid {}", self.id()));
|
.unwrap_or_else(|| panic!("self_id invalid {}", self.id()));
|
||||||
return r_gossip
|
r_gossip
|
||||||
.pull
|
.pull
|
||||||
.build_crds_filters(&r_gossip.crds, MAX_BLOOM_SIZE)
|
.build_crds_filters(&r_gossip.crds, MAX_BLOOM_SIZE)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@ -1186,8 +1318,8 @@ impl ClusterInfo {
|
|||||||
fn new_pull_requests(&self, stakes: &HashMap<Pubkey, u64>) -> Vec<(SocketAddr, Protocol)> {
|
fn new_pull_requests(&self, stakes: &HashMap<Pubkey, u64>) -> Vec<(SocketAddr, Protocol)> {
|
||||||
let now = timestamp();
|
let now = timestamp();
|
||||||
let mut pulls: Vec<_> = {
|
let mut pulls: Vec<_> = {
|
||||||
let r_gossip = self.gossip.read().unwrap();
|
let r_gossip =
|
||||||
|
self.time_gossip_read_lock("new_pull_reqs", &self.stats.new_pull_requests);
|
||||||
r_gossip
|
r_gossip
|
||||||
.new_pull_request(now, stakes, MAX_BLOOM_SIZE)
|
.new_pull_request(now, stakes, MAX_BLOOM_SIZE)
|
||||||
.ok()
|
.ok()
|
||||||
@ -1211,9 +1343,7 @@ impl ClusterInfo {
|
|||||||
pulls
|
pulls
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(peer, filter, gossip, self_info)| {
|
.map(|(peer, filter, gossip, self_info)| {
|
||||||
self.gossip
|
self.time_gossip_write_lock("mark_pull", &self.stats.mark_pull_request)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.mark_pull_request_creation_time(&peer, now);
|
.mark_pull_request_creation_time(&peer, now);
|
||||||
(gossip, Protocol::PullRequest(filter, self_info))
|
(gossip, Protocol::PullRequest(filter, self_info))
|
||||||
})
|
})
|
||||||
@ -1221,14 +1351,14 @@ impl ClusterInfo {
|
|||||||
}
|
}
|
||||||
fn new_push_requests(&self) -> Vec<(SocketAddr, Protocol)> {
|
fn new_push_requests(&self) -> Vec<(SocketAddr, Protocol)> {
|
||||||
let self_id = self.id();
|
let self_id = self.id();
|
||||||
let (_, push_messages) = self.gossip.write().unwrap().new_push_messages(timestamp());
|
let (_, push_messages) = self
|
||||||
|
.time_gossip_write_lock("new_push_requests", &self.stats.new_push_requests)
|
||||||
|
.new_push_messages(timestamp());
|
||||||
push_messages
|
push_messages
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|(peer, messages)| {
|
.filter_map(|(peer, messages)| {
|
||||||
let peer_label = CrdsValueLabel::ContactInfo(peer);
|
let peer_label = CrdsValueLabel::ContactInfo(peer);
|
||||||
self.gossip
|
self.time_gossip_read_lock("push_req_lookup", &self.stats.new_push_requests2)
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.crds
|
.crds
|
||||||
.lookup(&peer_label)
|
.lookup(&peer_label)
|
||||||
.and_then(CrdsValue::contact_info)
|
.and_then(CrdsValue::contact_info)
|
||||||
@ -1312,7 +1442,9 @@ impl ClusterInfo {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
let timeouts = obj.gossip.read().unwrap().make_timeouts(&stakes, timeout);
|
let timeouts = obj.gossip.read().unwrap().make_timeouts(&stakes, timeout);
|
||||||
let num_purged = obj.gossip.write().unwrap().purge(timestamp(), &timeouts);
|
let num_purged = obj
|
||||||
|
.time_gossip_write_lock("purge", &obj.stats.purge)
|
||||||
|
.purge(timestamp(), &timeouts);
|
||||||
inc_new_counter_info!("cluster_info-purge-count", num_purged);
|
inc_new_counter_info!("cluster_info-purge-count", num_purged);
|
||||||
let table_size = obj.gossip.read().unwrap().crds.table.len();
|
let table_size = obj.gossip.read().unwrap().crds.table.len();
|
||||||
datapoint_debug!(
|
datapoint_debug!(
|
||||||
@ -1454,7 +1586,9 @@ impl ClusterInfo {
|
|||||||
"cluster_info-prune_message-size",
|
"cluster_info-prune_message-size",
|
||||||
data.prunes.len()
|
data.prunes.len()
|
||||||
);
|
);
|
||||||
match me.gossip.write().unwrap().process_prune_msg(
|
match me
|
||||||
|
.time_gossip_write_lock("process_prune", &me.stats.process_prune)
|
||||||
|
.process_prune_msg(
|
||||||
&from,
|
&from,
|
||||||
&data.destination,
|
&data.destination,
|
||||||
&data.prunes,
|
&data.prunes,
|
||||||
@ -1524,9 +1658,7 @@ impl ClusterInfo {
|
|||||||
let now = timestamp();
|
let now = timestamp();
|
||||||
let self_id = me.id();
|
let self_id = me.id();
|
||||||
let pull_responses = me
|
let pull_responses = me
|
||||||
.gossip
|
.time_gossip_write_lock("process_pull_reqs", &me.stats.process_pull_requests)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.process_pull_requests(caller_and_filters, now);
|
.process_pull_requests(caller_and_filters, now);
|
||||||
|
|
||||||
// Filter bad to addresses
|
// Filter bad to addresses
|
||||||
@ -1630,17 +1762,15 @@ impl ClusterInfo {
|
|||||||
timeouts: &HashMap<Pubkey, u64>,
|
timeouts: &HashMap<Pubkey, u64>,
|
||||||
) {
|
) {
|
||||||
let len = data.len();
|
let len = data.len();
|
||||||
let now = Instant::now();
|
trace!("PullResponse me: {} from: {} len={}", me.id, from, len);
|
||||||
let self_id = me.gossip.read().unwrap().id;
|
let (_fail, timeout_count) = me
|
||||||
trace!("PullResponse me: {} from: {} len={}", self_id, from, len);
|
.time_gossip_write_lock("process_pull", &me.stats.process_pull_response)
|
||||||
me.gossip
|
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.process_pull_response(from, timeouts, data, timestamp());
|
.process_pull_response(from, timeouts, data, timestamp());
|
||||||
inc_new_counter_debug!("cluster_info-pull_request_response", 1);
|
me.stats.process_pull_response_count.add_relaxed(1);
|
||||||
inc_new_counter_debug!("cluster_info-pull_request_response-size", len);
|
me.stats.process_pull_response_len.add_relaxed(len as u64);
|
||||||
|
me.stats
|
||||||
report_time_spent("ReceiveUpdates", &now.elapsed(), &format!(" len: {}", len));
|
.process_pull_response_timeout
|
||||||
|
.add_relaxed(timeout_count as u64);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn handle_push_message(
|
fn handle_push_message(
|
||||||
@ -1653,17 +1783,13 @@ impl ClusterInfo {
|
|||||||
let self_id = me.id();
|
let self_id = me.id();
|
||||||
inc_new_counter_debug!("cluster_info-push_message", 1);
|
inc_new_counter_debug!("cluster_info-push_message", 1);
|
||||||
|
|
||||||
let updated: Vec<_> =
|
let updated: Vec<_> = me
|
||||||
me.gossip
|
.time_gossip_write_lock("process_push", &me.stats.process_push_message)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.process_push_message(from, data, timestamp());
|
.process_push_message(from, data, timestamp());
|
||||||
|
|
||||||
let updated_labels: Vec<_> = updated.into_iter().map(|u| u.value.label()).collect();
|
let updated_labels: Vec<_> = updated.into_iter().map(|u| u.value.label()).collect();
|
||||||
let prunes_map: HashMap<Pubkey, HashSet<Pubkey>> = me
|
let prunes_map: HashMap<Pubkey, HashSet<Pubkey>> = me
|
||||||
.gossip
|
.time_gossip_write_lock("prune_received_cache", &me.stats.prune_received_cache)
|
||||||
.write()
|
|
||||||
.unwrap()
|
|
||||||
.prune_received_cache(updated_labels, stakes);
|
.prune_received_cache(updated_labels, stakes);
|
||||||
|
|
||||||
let rsp: Vec<_> = prunes_map
|
let rsp: Vec<_> = prunes_map
|
||||||
@ -1714,6 +1840,7 @@ impl ClusterInfo {
|
|||||||
requests_receiver: &PacketReceiver,
|
requests_receiver: &PacketReceiver,
|
||||||
response_sender: &PacketSender,
|
response_sender: &PacketSender,
|
||||||
thread_pool: &ThreadPool,
|
thread_pool: &ThreadPool,
|
||||||
|
last_print: &mut Instant,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
//TODO cache connections
|
//TODO cache connections
|
||||||
let timeout = Duration::new(1, 0);
|
let timeout = Duration::new(1, 0);
|
||||||
@ -1754,8 +1881,104 @@ impl ClusterInfo {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Self::print_reset_stats(obj, last_print);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn print_reset_stats(&self, last_print: &mut Instant) {
|
||||||
|
if last_print.elapsed().as_millis() > 1000 {
|
||||||
|
datapoint_info!(
|
||||||
|
"cluster_info_stats",
|
||||||
|
("entrypoint", self.stats.entrypoint.clear(), i64),
|
||||||
|
("entrypoint2", self.stats.entrypoint2.clear(), i64),
|
||||||
|
("push_vote_read", self.stats.push_vote_read.clear(), i64),
|
||||||
|
(
|
||||||
|
"vote_process_push",
|
||||||
|
self.stats.vote_process_push.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("get_votes", self.stats.get_votes.clear(), i64),
|
||||||
|
(
|
||||||
|
"get_accounts_hash",
|
||||||
|
self.stats.get_accounts_hash.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("all_tvu_peers", self.stats.all_tvu_peers.clear(), i64),
|
||||||
|
("tvu_peers", self.stats.tvu_peers.clear(), i64),
|
||||||
|
);
|
||||||
|
datapoint_info!(
|
||||||
|
"cluster_info_stats2",
|
||||||
|
("retransmit_peers", self.stats.retransmit_peers.clear(), i64),
|
||||||
|
("repair_peers", self.stats.repair_peers.clear(), i64),
|
||||||
|
(
|
||||||
|
"new_push_requests",
|
||||||
|
self.stats.new_push_requests.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"new_push_requests2",
|
||||||
|
self.stats.new_push_requests2.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("purge", self.stats.purge.clear(), i64),
|
||||||
|
(
|
||||||
|
"process_pull_resp",
|
||||||
|
self.stats.process_pull_response.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"process_pull_resp_count",
|
||||||
|
self.stats.process_pull_response_count.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
);
|
||||||
|
datapoint_info!(
|
||||||
|
"cluster_info_stats3",
|
||||||
|
(
|
||||||
|
"process_pull_resp_len",
|
||||||
|
self.stats.process_pull_response_len.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"process_pull_requests",
|
||||||
|
self.stats.process_pull_requests.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("process_prune", self.stats.process_prune.clear(), i64),
|
||||||
|
(
|
||||||
|
"process_push_message",
|
||||||
|
self.stats.process_push_message.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"prune_received_cache",
|
||||||
|
self.stats.prune_received_cache.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"epoch_slots_lookup",
|
||||||
|
self.stats.epoch_slots_lookup.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
("epoch_slots_push", self.stats.epoch_slots_push.clear(), i64),
|
||||||
|
("push_message", self.stats.push_message.clear(), i64),
|
||||||
|
(
|
||||||
|
"new_pull_requests",
|
||||||
|
self.stats.new_pull_requests.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"mark_pull_request",
|
||||||
|
self.stats.mark_pull_request.clear(),
|
||||||
|
i64
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
*last_print = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn listen(
|
pub fn listen(
|
||||||
me: Arc<Self>,
|
me: Arc<Self>,
|
||||||
bank_forks: Option<Arc<RwLock<BankForks>>>,
|
bank_forks: Option<Arc<RwLock<BankForks>>>,
|
||||||
@ -1772,6 +1995,7 @@ impl ClusterInfo {
|
|||||||
.num_threads(get_thread_count())
|
.num_threads(get_thread_count())
|
||||||
.build()
|
.build()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let mut last_print = Instant::now();
|
||||||
loop {
|
loop {
|
||||||
let e = Self::run_listen(
|
let e = Self::run_listen(
|
||||||
&me,
|
&me,
|
||||||
@ -1780,6 +2004,7 @@ impl ClusterInfo {
|
|||||||
&requests_receiver,
|
&requests_receiver,
|
||||||
&response_sender,
|
&response_sender,
|
||||||
&thread_pool,
|
&thread_pool,
|
||||||
|
&mut last_print,
|
||||||
);
|
);
|
||||||
if exit.load(Ordering::Relaxed) {
|
if exit.load(Ordering::Relaxed) {
|
||||||
return;
|
return;
|
||||||
@ -2034,13 +2259,6 @@ impl Node {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn report_time_spent(label: &str, time: &Duration, extra: &str) {
|
|
||||||
let time_ms = duration_as_ms(time);
|
|
||||||
if time_ms > 100 {
|
|
||||||
info!("{} took: {} ms {}", label, time_ms, extra);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn stake_weight_peers<S: std::hash::BuildHasher>(
|
pub fn stake_weight_peers<S: std::hash::BuildHasher>(
|
||||||
peers: &mut Vec<ContactInfo>,
|
peers: &mut Vec<ContactInfo>,
|
||||||
stakes: Option<Arc<HashMap<Pubkey, u64, S>>>,
|
stakes: Option<Arc<HashMap<Pubkey, u64, S>>>,
|
||||||
|
@ -173,7 +173,7 @@ impl CrdsGossip {
|
|||||||
timeouts: &HashMap<Pubkey, u64>,
|
timeouts: &HashMap<Pubkey, u64>,
|
||||||
response: Vec<CrdsValue>,
|
response: Vec<CrdsValue>,
|
||||||
now: u64,
|
now: u64,
|
||||||
) -> usize {
|
) -> (usize, usize) {
|
||||||
self.pull
|
self.pull
|
||||||
.process_pull_response(&mut self.crds, from, timeouts, response, now)
|
.process_pull_response(&mut self.crds, from, timeouts, response, now)
|
||||||
}
|
}
|
||||||
|
@ -231,8 +231,9 @@ impl CrdsGossipPull {
|
|||||||
timeouts: &HashMap<Pubkey, u64>,
|
timeouts: &HashMap<Pubkey, u64>,
|
||||||
response: Vec<CrdsValue>,
|
response: Vec<CrdsValue>,
|
||||||
now: u64,
|
now: u64,
|
||||||
) -> usize {
|
) -> (usize, usize) {
|
||||||
let mut failed = 0;
|
let mut failed = 0;
|
||||||
|
let mut timeout_count = 0;
|
||||||
for r in response {
|
for r in response {
|
||||||
let owner = r.label().pubkey();
|
let owner = r.label().pubkey();
|
||||||
// Check if the crds value is older than the msg_timeout
|
// Check if the crds value is older than the msg_timeout
|
||||||
@ -252,10 +253,7 @@ impl CrdsGossipPull {
|
|||||||
if now > r.wallclock().checked_add(timeout).unwrap_or_else(|| 0)
|
if now > r.wallclock().checked_add(timeout).unwrap_or_else(|| 0)
|
||||||
|| now + timeout < r.wallclock()
|
|| now + timeout < r.wallclock()
|
||||||
{
|
{
|
||||||
inc_new_counter_warn!(
|
timeout_count += 1;
|
||||||
"cluster_info-gossip_pull_response_value_timeout",
|
|
||||||
1
|
|
||||||
);
|
|
||||||
failed += 1;
|
failed += 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -264,10 +262,7 @@ impl CrdsGossipPull {
|
|||||||
// Before discarding this value, check if a ContactInfo for the owner
|
// Before discarding this value, check if a ContactInfo for the owner
|
||||||
// exists in the table. If it doesn't, that implies that this value can be discarded
|
// exists in the table. If it doesn't, that implies that this value can be discarded
|
||||||
if crds.lookup(&CrdsValueLabel::ContactInfo(owner)).is_none() {
|
if crds.lookup(&CrdsValueLabel::ContactInfo(owner)).is_none() {
|
||||||
inc_new_counter_warn!(
|
timeout_count += 1;
|
||||||
"cluster_info-gossip_pull_response_value_timeout",
|
|
||||||
1
|
|
||||||
);
|
|
||||||
failed += 1;
|
failed += 1;
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
@ -289,7 +284,7 @@ impl CrdsGossipPull {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
crds.update_record_timestamp(from, now);
|
crds.update_record_timestamp(from, now);
|
||||||
failed
|
(failed, timeout_count)
|
||||||
}
|
}
|
||||||
// build a set of filters of the current crds table
|
// build a set of filters of the current crds table
|
||||||
// num_filters - used to increase the likelyhood of a value in crds being added to some filter
|
// num_filters - used to increase the likelyhood of a value in crds being added to some filter
|
||||||
@ -660,13 +655,15 @@ mod test {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
assert_eq!(rsp.len(), 1);
|
assert_eq!(rsp.len(), 1);
|
||||||
let failed = node.process_pull_response(
|
let failed = node
|
||||||
|
.process_pull_response(
|
||||||
&mut node_crds,
|
&mut node_crds,
|
||||||
&node_pubkey,
|
&node_pubkey,
|
||||||
&node.make_timeouts_def(&node_pubkey, &HashMap::new(), 0, 1),
|
&node.make_timeouts_def(&node_pubkey, &HashMap::new(), 0, 1),
|
||||||
rsp.pop().unwrap(),
|
rsp.pop().unwrap(),
|
||||||
1,
|
1,
|
||||||
);
|
)
|
||||||
|
.0;
|
||||||
assert_eq!(failed, 0);
|
assert_eq!(failed, 0);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
node_crds
|
node_crds
|
||||||
@ -827,7 +824,8 @@ mod test {
|
|||||||
&timeouts,
|
&timeouts,
|
||||||
vec![peer_entry.clone()],
|
vec![peer_entry.clone()],
|
||||||
1,
|
1,
|
||||||
),
|
)
|
||||||
|
.0,
|
||||||
0
|
0
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -843,7 +841,8 @@ mod test {
|
|||||||
&timeouts,
|
&timeouts,
|
||||||
vec![peer_entry.clone(), unstaked_peer_entry],
|
vec![peer_entry.clone(), unstaked_peer_entry],
|
||||||
node.msg_timeout + 100,
|
node.msg_timeout + 100,
|
||||||
),
|
)
|
||||||
|
.0,
|
||||||
2
|
2
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -856,7 +855,8 @@ mod test {
|
|||||||
&timeouts,
|
&timeouts,
|
||||||
vec![peer_entry],
|
vec![peer_entry],
|
||||||
node.msg_timeout + 1,
|
node.msg_timeout + 1,
|
||||||
),
|
)
|
||||||
|
.0,
|
||||||
0
|
0
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -872,7 +872,8 @@ mod test {
|
|||||||
&timeouts,
|
&timeouts,
|
||||||
vec![peer_vote.clone()],
|
vec![peer_vote.clone()],
|
||||||
node.msg_timeout + 1,
|
node.msg_timeout + 1,
|
||||||
),
|
)
|
||||||
|
.0,
|
||||||
0
|
0
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -885,7 +886,8 @@ mod test {
|
|||||||
&timeouts,
|
&timeouts,
|
||||||
vec![peer_vote],
|
vec![peer_vote],
|
||||||
node.msg_timeout + 1,
|
node.msg_timeout + 1,
|
||||||
),
|
)
|
||||||
|
.0,
|
||||||
1
|
1
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -455,7 +455,8 @@ fn network_run_pull(
|
|||||||
overhead += node
|
overhead += node
|
||||||
.lock()
|
.lock()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.process_pull_response(&from, &timeouts, rsp, now);
|
.process_pull_response(&from, &timeouts, rsp, now)
|
||||||
|
.0;
|
||||||
}
|
}
|
||||||
(bytes, msgs, overhead)
|
(bytes, msgs, overhead)
|
||||||
})
|
})
|
||||||
|
Reference in New Issue
Block a user