Add retry mechanism when downloading genesis and snapshots

This commit is contained in:
Michael Vines
2020-02-25 10:41:13 -07:00
parent 2c63cf3cbd
commit fbf78b83c4
6 changed files with 169 additions and 121 deletions

View File

@ -44,6 +44,7 @@ use solana_sdk::{
timing::timestamp, timing::timestamp,
}; };
use std::{ use std::{
collections::HashSet,
net::{IpAddr, Ipv4Addr, SocketAddr}, net::{IpAddr, Ipv4Addr, SocketAddr},
path::{Path, PathBuf}, path::{Path, PathBuf},
process, process,
@ -73,6 +74,7 @@ pub struct ValidatorConfig {
pub fixed_leader_schedule: Option<FixedSchedule>, pub fixed_leader_schedule: Option<FixedSchedule>,
pub wait_for_supermajority: bool, pub wait_for_supermajority: bool,
pub new_hard_forks: Option<Vec<Slot>>, pub new_hard_forks: Option<Vec<Slot>>,
pub trusted_validators: Option<HashSet<Pubkey>>, // None = trust all
} }
impl Default for ValidatorConfig { impl Default for ValidatorConfig {
@ -95,6 +97,7 @@ impl Default for ValidatorConfig {
fixed_leader_schedule: None, fixed_leader_schedule: None,
wait_for_supermajority: false, wait_for_supermajority: false,
new_hard_forks: None, new_hard_forks: None,
trusted_validators: None,
} }
} }
} }
@ -341,9 +344,7 @@ impl Validator {
} }
if let Some(snapshot_hash) = snapshot_hash { if let Some(snapshot_hash) = snapshot_hash {
if let Some(ref trusted_validators) = if let Some(ref trusted_validators) = config.trusted_validators {
config.snapshot_config.as_ref().unwrap().trusted_validators
{
let mut trusted = false; let mut trusted = false;
for _ in 0..10 { for _ in 0..10 {
trusted = cluster_info trusted = cluster_info

View File

@ -55,7 +55,6 @@ mod tests {
snapshot_interval_slots, snapshot_interval_slots,
snapshot_package_output_path: PathBuf::from(snapshot_output_path.path()), snapshot_package_output_path: PathBuf::from(snapshot_output_path.path()),
snapshot_path: PathBuf::from(snapshot_dir.path()), snapshot_path: PathBuf::from(snapshot_dir.path()),
trusted_validators: None,
}; };
bank_forks.set_snapshot_config(Some(snapshot_config.clone())); bank_forks.set_snapshot_config(Some(snapshot_config.clone()));
SnapshotTestConfig { SnapshotTestConfig {

View File

@ -536,7 +536,6 @@ fn load_bank_forks(
snapshot_interval_slots: 0, // Value doesn't matter snapshot_interval_slots: 0, // Value doesn't matter
snapshot_package_output_path: ledger_path.clone(), snapshot_package_output_path: ledger_path.clone(),
snapshot_path: ledger_path.clone().join("snapshot"), snapshot_path: ledger_path.clone().join("snapshot"),
trusted_validators: None,
}) })
}; };
let account_paths = if let Some(account_paths) = arg_matches.value_of("account_paths") { let account_paths = if let Some(account_paths) = arg_matches.value_of("account_paths") {

View File

@ -6,7 +6,7 @@ use log::*;
use solana_measure::measure::Measure; use solana_measure::measure::Measure;
use solana_metrics::inc_new_counter_info; use solana_metrics::inc_new_counter_info;
use solana_runtime::{bank::Bank, status_cache::MAX_CACHE_ENTRIES}; use solana_runtime::{bank::Bank, status_cache::MAX_CACHE_ENTRIES};
use solana_sdk::{clock::Slot, pubkey::Pubkey, timing}; use solana_sdk::{clock::Slot, timing};
use std::{ use std::{
collections::{HashMap, HashSet}, collections::{HashMap, HashSet},
ops::Index, ops::Index,
@ -26,10 +26,6 @@ pub struct SnapshotConfig {
// Where to place the snapshots for recent slots // Where to place the snapshots for recent slots
pub snapshot_path: PathBuf, pub snapshot_path: PathBuf,
// Validators that must vouch for a given snapshot hash before it's accepted
// None = accept any snapshot hash
pub trusted_validators: Option<HashSet<Pubkey>>,
} }
#[derive(Error, Debug)] #[derive(Error, Debug)]

View File

@ -726,12 +726,9 @@ fn test_snapshots_blockstore_floor() {
let (cluster_nodes, _) = discover_cluster(&cluster.entry_point_info.gossip, 1).unwrap(); let (cluster_nodes, _) = discover_cluster(&cluster.entry_point_info.gossip, 1).unwrap();
let mut trusted_validators = HashSet::new(); let mut trusted_validators = HashSet::new();
trusted_validators.insert(cluster_nodes[0].id); trusted_validators.insert(cluster_nodes[0].id);
if let Some(ref mut config) = validator_snapshot_test_config validator_snapshot_test_config
.validator_config .validator_config
.snapshot_config .trusted_validators = Some(trusted_validators);
{
config.trusted_validators = Some(trusted_validators);
}
cluster.add_validator( cluster.add_validator(
&validator_snapshot_test_config.validator_config, &validator_snapshot_test_config.validator_config,
@ -1012,7 +1009,6 @@ fn setup_snapshot_validator_config(
snapshot_interval_slots, snapshot_interval_slots,
snapshot_package_output_path: PathBuf::from(snapshot_output_path.path()), snapshot_package_output_path: PathBuf::from(snapshot_output_path.path()),
snapshot_path: PathBuf::from(snapshot_dir.path()), snapshot_path: PathBuf::from(snapshot_dir.path()),
trusted_validators: None,
}; };
// Create the account paths // Create the account paths

View File

@ -27,6 +27,7 @@ use solana_ledger::bank_forks::SnapshotConfig;
use solana_perf::recycler::enable_recycler_warming; use solana_perf::recycler::enable_recycler_warming;
use solana_sdk::{ use solana_sdk::{
clock::Slot, clock::Slot,
genesis_config::GenesisConfig,
hash::Hash, hash::Hash,
pubkey::Pubkey, pubkey::Pubkey,
signature::{Keypair, Signer}, signature::{Keypair, Signer},
@ -35,7 +36,7 @@ use std::{
collections::HashSet, collections::HashSet,
fs::{self, File}, fs::{self, File},
io::{self, Read}, io::{self, Read},
net::{SocketAddr, TcpListener}, net::{SocketAddr, TcpListener, UdpSocket},
path::{Path, PathBuf}, path::{Path, PathBuf},
process::exit, process::exit,
str::FromStr, str::FromStr,
@ -227,7 +228,7 @@ fn get_shred_rpc_peers(
fn get_trusted_snapshot_hashes( fn get_trusted_snapshot_hashes(
cluster_info: &Arc<RwLock<ClusterInfo>>, cluster_info: &Arc<RwLock<ClusterInfo>>,
trusted_validators: Option<&HashSet<Pubkey>>, trusted_validators: &Option<HashSet<Pubkey>>,
) -> Option<HashSet<(Slot, Hash)>> { ) -> Option<HashSet<(Slot, Hash)>> {
if let Some(trusted_validators) = trusted_validators { if let Some(trusted_validators) = trusted_validators {
let mut trusted_snapshot_hashes = HashSet::new(); let mut trusted_snapshot_hashes = HashSet::new();
@ -248,14 +249,11 @@ fn get_trusted_snapshot_hashes(
} }
} }
fn get_rpc_node( fn start_gossip_spy(
node: &Node,
identity_keypair: &Arc<Keypair>, identity_keypair: &Arc<Keypair>,
entrypoint_gossip: &SocketAddr, entrypoint_gossip: &SocketAddr,
expected_shred_version: Option<u16>, gossip_socket: UdpSocket,
trusted_validators: Option<&HashSet<Pubkey>>, ) -> (Arc<RwLock<ClusterInfo>>, Arc<AtomicBool>, GossipService) {
snapshot_not_required: bool,
) -> (ContactInfo, RpcClient, Option<(Slot, Hash)>) {
let mut cluster_info = ClusterInfo::new( let mut cluster_info = ClusterInfo::new(
ClusterInfo::spy_contact_info(&identity_keypair.pubkey()), ClusterInfo::spy_contact_info(&identity_keypair.pubkey()),
identity_keypair.clone(), identity_keypair.clone(),
@ -267,30 +265,60 @@ fn get_rpc_node(
let gossip_service = GossipService::new( let gossip_service = GossipService::new(
&cluster_info.clone(), &cluster_info.clone(),
None, None,
node.sockets.gossip.try_clone().unwrap(), gossip_socket,
&gossip_exit_flag, &gossip_exit_flag,
); );
(cluster_info, gossip_exit_flag, gossip_service)
}
let (rpc_contact_info, rpc_client, selected_snapshot_hash) = loop { fn get_rpc_node(
cluster_info: &Arc<RwLock<ClusterInfo>>,
validator_config: &ValidatorConfig,
blacklisted_rpc_nodes: &mut HashSet<Pubkey>,
snapshot_not_required: bool,
) -> (ContactInfo, Option<(Slot, Hash)>) {
let mut blacklist_timeout = Instant::now();
loop {
info!( info!(
"Searching for an RPC service, shred version={:?}...", "Searching for an RPC service, shred version={:?}...",
expected_shred_version validator_config.expected_shred_version
); );
sleep(Duration::from_secs(1)); sleep(Duration::from_secs(1));
info!("\n{}", cluster_info.read().unwrap().contact_info_trace()); info!("\n{}", cluster_info.read().unwrap().contact_info_trace());
let rpc_peers = get_shred_rpc_peers(&cluster_info, expected_shred_version); let rpc_peers = get_shred_rpc_peers(&cluster_info, validator_config.expected_shred_version);
if rpc_peers.is_empty() { let rpc_peers_total = rpc_peers.len();
info!("No RPC services found");
// Filter out blacklisted nodes
let rpc_peers: Vec<_> = rpc_peers
.into_iter()
.filter(|rpc_peer| !blacklisted_rpc_nodes.contains(&rpc_peer.id))
.collect();
let rpc_peers_blacklisted = rpc_peers_total - rpc_peers.len();
if rpc_peers_blacklisted == rpc_peers_total {
// If all nodes are blacklisted and no additional nodes are discovered after 60 seconds,
// remove the blacklist and try them all again
if blacklist_timeout.elapsed().as_secs() > 60 {
info!("Node blacklist timeout expired");
blacklisted_rpc_nodes.clear();
continue; continue;
} }
} else {
blacklist_timeout = Instant::now();
}
info!(
"Total {} RPC nodes found. {} blacklisted ",
rpc_peers_total, rpc_peers_blacklisted
);
let mut highest_snapshot_hash: Option<(Slot, Hash)> = None; let mut highest_snapshot_hash: Option<(Slot, Hash)> = None;
let eligible_rpc_peers = if snapshot_not_required { let eligible_rpc_peers = if snapshot_not_required {
rpc_peers rpc_peers
} else { } else {
let trusted_snapshot_hashes = let trusted_snapshot_hashes =
get_trusted_snapshot_hashes(&cluster_info, trusted_validators); get_trusted_snapshot_hashes(&cluster_info, &validator_config.trusted_validators);
let mut eligible_rpc_peers = vec![]; let mut eligible_rpc_peers = vec![];
@ -343,37 +371,11 @@ fn get_rpc_node(
}; };
if !eligible_rpc_peers.is_empty() { if !eligible_rpc_peers.is_empty() {
// Prefer the entrypoint's RPC service if present, otherwise pick one at random let contact_info =
let contact_info = if let Some(contact_info) = eligible_rpc_peers &eligible_rpc_peers[thread_rng().gen_range(0, eligible_rpc_peers.len())];
.iter() return (contact_info.clone(), highest_snapshot_hash);
.find(|contact_info| contact_info.gossip == *entrypoint_gossip)
{
contact_info
} else {
&eligible_rpc_peers[thread_rng().gen_range(0, eligible_rpc_peers.len())]
};
info!(
"Trying RPC service from node {}: {:?}",
contact_info.id, contact_info.rpc
);
let rpc_client = RpcClient::new_socket(contact_info.rpc);
match rpc_client.get_version() {
Ok(rpc_version) => {
info!("RPC node version: {}", rpc_version.solana_core);
break (contact_info.clone(), rpc_client, highest_snapshot_hash);
}
Err(err) => {
warn!("Failed to get RPC node's version: {}", err);
} }
} }
}
};
gossip_exit_flag.store(true, Ordering::Relaxed);
gossip_service.join().unwrap();
(rpc_contact_info, rpc_client, selected_snapshot_hash)
} }
fn check_vote_account( fn check_vote_account(
@ -448,16 +450,53 @@ fn check_vote_account(
Ok(()) Ok(())
} }
fn download_ledger( fn download_genesis(
rpc_addr: &SocketAddr,
rpc_client: &RpcClient,
ledger_path: &Path,
validator_config: &mut ValidatorConfig,
) -> Result<(), String> {
let genesis_hash = rpc_client
.get_genesis_hash()
.map_err(|err| format!("Failed to get genesis hash: {}", err))?;
if let Some(expected_genesis_hash) = validator_config.expected_genesis_hash {
if expected_genesis_hash != genesis_hash {
return Err(format!(
"Genesis hash mismatch: expected {} but local genesis hash is {}",
expected_genesis_hash, genesis_hash,
));
}
}
download_tar_bz2(&rpc_addr, "genesis.tar.bz2", &ledger_path, false)
.map_err(|err| format!("Failed to download genesis config: {}", err))?;
let genesis_config = GenesisConfig::load(&ledger_path)
.map_err(|err| format!("Failed to load genesis config: {}", err))?;
if genesis_config.hash() != genesis_hash {
return Err(format!(
"Genesis hash mismatch: expected {} but downloaded genesis hash is {}",
genesis_hash,
genesis_config.hash(),
));
}
validator_config.expected_genesis_hash = Some(genesis_hash);
Ok(())
}
fn download_snapshot(
rpc_addr: &SocketAddr, rpc_addr: &SocketAddr,
ledger_path: &Path, ledger_path: &Path,
snapshot_hash: Option<(Slot, Hash)>, snapshot_hash: Option<(Slot, Hash)>,
) -> Result<(), String> { ) -> Result<(), String> {
download_tar_bz2(rpc_addr, "genesis.tar.bz2", ledger_path, false)?; if snapshot_hash.is_none() {
return Ok(());
}
if snapshot_hash.is_some() { let snapshot_package = solana_ledger::snapshot_utils::get_snapshot_archive_path(ledger_path);
let snapshot_package =
solana_ledger::snapshot_utils::get_snapshot_archive_path(ledger_path);
if snapshot_package.exists() { if snapshot_package.exists() {
fs::remove_file(&snapshot_package) fs::remove_file(&snapshot_package)
.map_err(|err| format!("error removing {:?}: {}", snapshot_package, err))?; .map_err(|err| format!("error removing {:?}: {}", snapshot_package, err))?;
@ -469,7 +508,6 @@ fn download_ledger(
true, true,
) )
.map_err(|err| format!("Failed to fetch snapshot: {:?}", err))?; .map_err(|err| format!("Failed to fetch snapshot: {:?}", err))?;
}
Ok(()) Ok(())
} }
@ -843,6 +881,7 @@ pub fn main() {
.map(|rpc_port| (rpc_port, rpc_port + 1)), .map(|rpc_port| (rpc_port, rpc_port + 1)),
voting_disabled: matches.is_present("no_voting"), voting_disabled: matches.is_present("no_voting"),
wait_for_supermajority: !matches.is_present("no_wait_for_supermajority"), wait_for_supermajority: !matches.is_present("no_wait_for_supermajority"),
trusted_validators,
..ValidatorConfig::default() ..ValidatorConfig::default()
}; };
@ -891,7 +930,6 @@ pub fn main() {
}, },
snapshot_path, snapshot_path,
snapshot_package_output_path: ledger_path.clone(), snapshot_package_output_path: ledger_path.clone(),
trusted_validators,
}); });
if matches.is_present("limit_ledger_size") { if matches.is_present("limit_ledger_size") {
@ -1049,36 +1087,43 @@ pub fn main() {
); );
if !no_genesis_fetch { if !no_genesis_fetch {
let (rpc_contact_info, rpc_client, snapshot_hash) = get_rpc_node( let (cluster_info, gossip_exit_flag, gossip_service) = start_gossip_spy(
&node,
&identity_keypair, &identity_keypair,
&cluster_entrypoint.gossip, &cluster_entrypoint.gossip,
validator_config.expected_shred_version, node.sockets.gossip.try_clone().unwrap(),
validator_config );
.snapshot_config
.as_ref() let mut blacklisted_rpc_nodes = HashSet::new();
.unwrap() loop {
.trusted_validators let (rpc_contact_info, snapshot_hash) = get_rpc_node(
.as_ref(), &cluster_info,
&validator_config,
&mut blacklisted_rpc_nodes,
no_snapshot_fetch, no_snapshot_fetch,
); );
info!(
let genesis_hash = rpc_client.get_genesis_hash().unwrap_or_else(|err| { "Using RPC service from node {}: {:?}",
error!("Failed to get genesis hash: {}", err); rpc_contact_info.id, rpc_contact_info.rpc
exit(1);
});
if let Some(expected_genesis_hash) = validator_config.expected_genesis_hash {
if expected_genesis_hash != genesis_hash {
error!(
"Genesis hash mismatch: expected {} but local genesis hash is {}",
expected_genesis_hash, genesis_hash,
); );
exit(1); let rpc_client = RpcClient::new_socket(rpc_contact_info.rpc);
}
}
validator_config.expected_genesis_hash = Some(genesis_hash);
let result = match rpc_client.get_version() {
Ok(rpc_version) => {
info!("RPC node version: {}", rpc_version.solana_core);
Ok(())
}
Err(err) => Err(format!("Failed to get RPC node version: {}", err)),
}
.and_then(|_| {
download_genesis(
&rpc_contact_info.rpc,
&rpc_client,
&ledger_path,
&mut validator_config,
)
})
.and_then(|_| download_snapshot(&rpc_contact_info.rpc, &ledger_path, snapshot_hash))
.and_then(|_| {
if !validator_config.voting_disabled && !no_check_vote_account { if !validator_config.voting_disabled && !no_check_vote_account {
check_vote_account( check_vote_account(
&rpc_client, &rpc_client,
@ -1086,18 +1131,30 @@ pub fn main() {
&voting_keypair.pubkey(), &voting_keypair.pubkey(),
&identity_keypair.pubkey(), &identity_keypair.pubkey(),
) )
.unwrap_or_else(|err| { } else {
error!("Failed to check vote account: {}", err); Ok(())
exit(1); }
}); });
if result.is_ok() {
break;
}
warn!("{}", result.unwrap_err());
if let Some(ref trusted_validators) = validator_config.trusted_validators {
if trusted_validators.contains(&rpc_contact_info.id) {
continue; // Never blacklist a trusted node
}
} }
download_ledger(&rpc_contact_info.rpc, &ledger_path, snapshot_hash).unwrap_or_else( info!(
|err| { "Excluding {} as a future RPC candidate",
error!("Failed to initialize ledger: {}", err); rpc_contact_info.id
exit(1);
},
); );
blacklisted_rpc_nodes.insert(rpc_contact_info.id);
}
gossip_exit_flag.store(true, Ordering::Relaxed);
gossip_service.join().unwrap();
} }
} }