Persist gossip contact info
This commit is contained in:
		@@ -69,8 +69,11 @@ use std::{
 | 
			
		||||
    cmp::min,
 | 
			
		||||
    collections::{hash_map::Entry, HashMap, HashSet, VecDeque},
 | 
			
		||||
    fmt::{self, Debug},
 | 
			
		||||
    fs::{self, File},
 | 
			
		||||
    io::BufReader,
 | 
			
		||||
    net::{IpAddr, Ipv4Addr, SocketAddr, TcpListener, UdpSocket},
 | 
			
		||||
    ops::{Deref, DerefMut},
 | 
			
		||||
    path::{Path, PathBuf},
 | 
			
		||||
    sync::atomic::{AtomicBool, AtomicU64, Ordering},
 | 
			
		||||
    sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
 | 
			
		||||
    thread::{sleep, Builder, JoinHandle},
 | 
			
		||||
@@ -107,7 +110,8 @@ const MAX_PRUNE_DATA_NODES: usize = 32;
 | 
			
		||||
const GOSSIP_PING_TOKEN_SIZE: usize = 32;
 | 
			
		||||
const GOSSIP_PING_CACHE_CAPACITY: usize = 16384;
 | 
			
		||||
const GOSSIP_PING_CACHE_TTL: Duration = Duration::from_secs(640);
 | 
			
		||||
pub const DEFAULT_CONTACT_DEBUG_INTERVAL: u64 = 10_000;
 | 
			
		||||
pub const DEFAULT_CONTACT_DEBUG_INTERVAL_MILLIS: u64 = 10_000;
 | 
			
		||||
pub const DEFAULT_CONTACT_SAVE_INTERVAL_MILLIS: u64 = 60_000;
 | 
			
		||||
/// Minimum serialized size of a Protocol::PullResponse packet.
 | 
			
		||||
const PULL_RESPONSE_MIN_SERIALIZED_SIZE: usize = 167;
 | 
			
		||||
 | 
			
		||||
@@ -297,8 +301,10 @@ pub struct ClusterInfo {
 | 
			
		||||
    stats: GossipStats,
 | 
			
		||||
    socket: UdpSocket,
 | 
			
		||||
    local_message_pending_push_queue: RwLock<Vec<(CrdsValue, u64)>>,
 | 
			
		||||
    contact_debug_interval: u64,
 | 
			
		||||
    contact_debug_interval: u64, // milliseconds, 0 = disabled
 | 
			
		||||
    contact_save_interval: u64,  // milliseconds, 0 = disabled
 | 
			
		||||
    instance: NodeInstance,
 | 
			
		||||
    contact_info_path: PathBuf,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl Default for ClusterInfo {
 | 
			
		||||
@@ -554,8 +560,10 @@ impl ClusterInfo {
 | 
			
		||||
            stats: GossipStats::default(),
 | 
			
		||||
            socket: UdpSocket::bind("0.0.0.0:0").unwrap(),
 | 
			
		||||
            local_message_pending_push_queue: RwLock::new(vec![]),
 | 
			
		||||
            contact_debug_interval: DEFAULT_CONTACT_DEBUG_INTERVAL,
 | 
			
		||||
            contact_debug_interval: DEFAULT_CONTACT_DEBUG_INTERVAL_MILLIS,
 | 
			
		||||
            instance: NodeInstance::new(&mut thread_rng(), id, timestamp()),
 | 
			
		||||
            contact_info_path: PathBuf::default(),
 | 
			
		||||
            contact_save_interval: 0, // disabled
 | 
			
		||||
        };
 | 
			
		||||
        {
 | 
			
		||||
            let mut gossip = me.gossip.write().unwrap();
 | 
			
		||||
@@ -591,6 +599,8 @@ impl ClusterInfo {
 | 
			
		||||
            ),
 | 
			
		||||
            contact_debug_interval: self.contact_debug_interval,
 | 
			
		||||
            instance: NodeInstance::new(&mut thread_rng(), *new_id, timestamp()),
 | 
			
		||||
            contact_info_path: PathBuf::default(),
 | 
			
		||||
            contact_save_interval: 0, // disabled
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -649,6 +659,117 @@ impl ClusterInfo {
 | 
			
		||||
        *self.entrypoints.write().unwrap() = entrypoints;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn save_contact_info(&self) {
 | 
			
		||||
        let nodes = {
 | 
			
		||||
            let gossip = self.gossip.read().unwrap();
 | 
			
		||||
            let entrypoint_gossip_addrs = self
 | 
			
		||||
                .entrypoints
 | 
			
		||||
                .read()
 | 
			
		||||
                .unwrap()
 | 
			
		||||
                .iter()
 | 
			
		||||
                .map(|contact_info| contact_info.gossip)
 | 
			
		||||
                .collect::<HashSet<_>>();
 | 
			
		||||
 | 
			
		||||
            gossip
 | 
			
		||||
                .crds
 | 
			
		||||
                .get_nodes()
 | 
			
		||||
                .filter_map(|v| {
 | 
			
		||||
                    // Don't save:
 | 
			
		||||
                    // 1. Our ContactInfo. No point
 | 
			
		||||
                    // 2. Entrypoint ContactInfo. This will avoid adopting the incorrect shred
 | 
			
		||||
                    //    version on restart if the entrypoint shred version changes.  Also
 | 
			
		||||
                    //    there's not much point in saving entrypoint ContactInfo since by
 | 
			
		||||
                    //    definition that information is already available
 | 
			
		||||
                    let contact_info = v.value.contact_info().unwrap();
 | 
			
		||||
                    if contact_info.id != self.id()
 | 
			
		||||
                        && !entrypoint_gossip_addrs.contains(&contact_info.gossip)
 | 
			
		||||
                    {
 | 
			
		||||
                        return Some(v.value.clone());
 | 
			
		||||
                    }
 | 
			
		||||
                    None
 | 
			
		||||
                })
 | 
			
		||||
                .collect::<Vec<_>>()
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        if nodes.is_empty() {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        let filename = self.contact_info_path.join("contact-info.bin");
 | 
			
		||||
        let tmp_filename = &filename.with_extension("tmp");
 | 
			
		||||
 | 
			
		||||
        match File::create(&tmp_filename) {
 | 
			
		||||
            Ok(mut file) => {
 | 
			
		||||
                if let Err(err) = bincode::serialize_into(&mut file, &nodes) {
 | 
			
		||||
                    warn!(
 | 
			
		||||
                        "Failed to serialize contact info info {}: {}",
 | 
			
		||||
                        tmp_filename.display(),
 | 
			
		||||
                        err
 | 
			
		||||
                    );
 | 
			
		||||
                    return;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            Err(err) => {
 | 
			
		||||
                warn!("Failed to create {}: {}", tmp_filename.display(), err);
 | 
			
		||||
                return;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        match fs::rename(&tmp_filename, &filename) {
 | 
			
		||||
            Ok(()) => {
 | 
			
		||||
                info!(
 | 
			
		||||
                    "Saved contact info for {} nodes into {}",
 | 
			
		||||
                    nodes.len(),
 | 
			
		||||
                    filename.display()
 | 
			
		||||
                );
 | 
			
		||||
            }
 | 
			
		||||
            Err(err) => {
 | 
			
		||||
                warn!(
 | 
			
		||||
                    "Failed to rename {} to {}: {}",
 | 
			
		||||
                    tmp_filename.display(),
 | 
			
		||||
                    filename.display(),
 | 
			
		||||
                    err
 | 
			
		||||
                );
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn restore_contact_info(&mut self, contact_info_path: &Path, contact_save_interval: u64) {
 | 
			
		||||
        self.contact_info_path = contact_info_path.into();
 | 
			
		||||
        self.contact_save_interval = contact_save_interval;
 | 
			
		||||
 | 
			
		||||
        let filename = contact_info_path.join("contact-info.bin");
 | 
			
		||||
        if !filename.exists() {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        let nodes: Vec<CrdsValue> = match File::open(&filename) {
 | 
			
		||||
            Ok(file) => {
 | 
			
		||||
                bincode::deserialize_from(&mut BufReader::new(file)).unwrap_or_else(|err| {
 | 
			
		||||
                    warn!("Failed to deserialize {}: {}", filename.display(), err);
 | 
			
		||||
                    vec![]
 | 
			
		||||
                })
 | 
			
		||||
            }
 | 
			
		||||
            Err(err) => {
 | 
			
		||||
                warn!("Failed to open {}: {}", filename.display(), err);
 | 
			
		||||
                vec![]
 | 
			
		||||
            }
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        info!(
 | 
			
		||||
            "Loaded contact info for {} nodes from {}",
 | 
			
		||||
            nodes.len(),
 | 
			
		||||
            filename.display()
 | 
			
		||||
        );
 | 
			
		||||
        let now = timestamp();
 | 
			
		||||
        let mut gossip = self.gossip.write().unwrap();
 | 
			
		||||
        for node in nodes {
 | 
			
		||||
            if let Err(err) = gossip.crds.insert(node, now) {
 | 
			
		||||
                warn!("crds insert failed {:?}", err);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn id(&self) -> Pubkey {
 | 
			
		||||
        self.id
 | 
			
		||||
    }
 | 
			
		||||
@@ -1805,6 +1926,7 @@ impl ClusterInfo {
 | 
			
		||||
            .spawn(move || {
 | 
			
		||||
                let mut last_push = timestamp();
 | 
			
		||||
                let mut last_contact_info_trace = timestamp();
 | 
			
		||||
                let mut last_contact_info_save = timestamp();
 | 
			
		||||
                let mut entrypoints_processed = false;
 | 
			
		||||
                let recycler = PacketsRecycler::default();
 | 
			
		||||
                let crds_data = vec![
 | 
			
		||||
@@ -1822,7 +1944,7 @@ impl ClusterInfo {
 | 
			
		||||
                    if self.contact_debug_interval != 0
 | 
			
		||||
                        && start - last_contact_info_trace > self.contact_debug_interval
 | 
			
		||||
                    {
 | 
			
		||||
                        // Log contact info every 10 seconds
 | 
			
		||||
                        // Log contact info
 | 
			
		||||
                        info!(
 | 
			
		||||
                            "\n{}\n\n{}",
 | 
			
		||||
                            self.contact_info_trace(),
 | 
			
		||||
@@ -1831,6 +1953,13 @@ impl ClusterInfo {
 | 
			
		||||
                        last_contact_info_trace = start;
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    if self.contact_save_interval != 0
 | 
			
		||||
                        && start - last_contact_info_save > self.contact_save_interval
 | 
			
		||||
                    {
 | 
			
		||||
                        self.save_contact_info();
 | 
			
		||||
                        last_contact_info_save = start;
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    let stakes: HashMap<_, _> = match bank_forks {
 | 
			
		||||
                        Some(ref bank_forks) => {
 | 
			
		||||
                            bank_forks.read().unwrap().root_bank().staked_nodes()
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user