Add test for making sure switch doesn't happen past failure threshold (#11138)
Fix switch threshold

Co-authored-by: Carl <carl@solana.com>
@@ -405,6 +405,7 @@ impl Tower {
         total_stake: u64,
         epoch_vote_accounts: &HashMap<Pubkey, (u64, Account)>,
     ) -> SwitchForkDecision {
+        let root = self.lockouts.root_slot.unwrap_or(0);
         self.last_voted_slot()
             .map(|last_voted_slot| {
                 let last_vote_ancestors = ancestors.get(&last_voted_slot).unwrap();
@@ -427,12 +428,22 @@ impl Tower {
                 let mut locked_out_stake = 0;
                 let mut locked_out_vote_accounts = HashSet::new();
                 for (candidate_slot, descendants) in descendants.iter() {
-                    // 1) Only consider lockouts a tips of forks as that
-                    // includes all ancestors of that fork.
-                    // 2) Don't consider lockouts on the `last_vote` itself
-                    // 3) Don't consider lockouts on any descendants of
+                    // 1) Don't consider any banks that haven't been frozen yet
+                    // because the needed stats are unavailable
+                    // 2) Only consider lockouts at the latest `frozen` bank
+                    // on each fork, as that bank will contain all the
+                    // lockout intervals for ancestors on that fork as well.
+                    // 3) Don't consider lockouts on the `last_vote` itself
+                    // 4) Don't consider lockouts on any descendants of
                     // `last_vote`
-                    if !descendants.is_empty()
+                    // 5) Don't consider any banks before the root because
+                    // all lockouts must be ancestors of `last_vote`
+                    if !progress.get_fork_stats(*candidate_slot).map(|stats| stats.computed).unwrap_or(false)
+                        // If any of the descendants have the `computed` flag set, then there must be a more
+                        // recent frozen bank on this fork to use, so we can ignore this one. Otherwise,
+                        // even if this bank has descendants, if they have not yet been frozen / stats computed,
+                        // then use this bank as a representative for the fork.
+                        || descendants.iter().any(|d| progress.get_fork_stats(*d).map(|stats| stats.computed).unwrap_or(false))
                         || *candidate_slot == last_voted_slot
                         || ancestors
                             .get(&candidate_slot)
@@ -441,6 +452,7 @@ impl Tower {
                                 exist in the ancestors map",
                             )
                             .contains(&last_voted_slot)
+                        || *candidate_slot <= root
                     {
                         continue;
                     }
@@ -461,17 +473,18 @@ impl Tower {
                        .lockout_intervals;
                    // Find any locked out intervals in this bank with endpoint >= last_vote,
                    // implies they are locked out at last_vote
-                   for (_lockout_ineterval_end, value) in lockout_intervals.range((Included(last_voted_slot), Unbounded)) {
+                   for (_lockout_interval_end, value) in lockout_intervals.range((Included(last_voted_slot), Unbounded)) {
                        for (lockout_interval_start, vote_account_pubkey) in value {
                            // Only count lockouts on slots that are:
                            // 1) Not ancestors of `last_vote`
                            // 2) Not from before the current root as we can't determine if
                            // anything before the root was an ancestor of `last_vote` or not
                            if !last_vote_ancestors.contains(lockout_interval_start)
-                               // The check if the key exists in the ancestors map
-                               // is equivalent to checking if the key is above the
-                               // current root.
-                               && ancestors.contains_key(lockout_interval_start)
+                               // Given a `lockout_interval_start` < root that appears in a
+                               // bank for a `candidate_slot`, it must be that `lockout_interval_start`
+                               // is an ancestor of the current root, because `candidate_slot` is a
+                               // descendant of the current root
+                               && *lockout_interval_start > root
                                && !locked_out_vote_accounts.contains(vote_account_pubkey)
                            {
                                let stake = epoch_vote_accounts
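The range query above relies on each bank's lockout intervals being kept in a map keyed by the slot at which the lockout expires. A minimal, standalone sketch of that lookup follows; the element layout, names, and the simple string stand-in for the vote account pubkey are illustrative assumptions, not the exact types used by the fork progress map.

use std::collections::BTreeMap;
use std::ops::Bound::{Included, Unbounded};

type Slot = u64;
type VoteAccount = &'static str; // stand-in for the real vote account pubkey

fn main() {
    // Keyed by the slot at which a lockout expires; each entry lists
    // (lockout start slot, vote account) pairs observed in that bank.
    let mut lockout_intervals: BTreeMap<Slot, Vec<(Slot, VoteAccount)>> = BTreeMap::new();
    lockout_intervals.entry(46).or_default().push((14, "vote_a"));
    lockout_intervals.entry(60).or_default().push((12, "vote_b"));

    // Any interval whose endpoint is >= `last_voted_slot` still locks its
    // voter out at `last_voted_slot`, so a range query from that key upward
    // finds exactly the lockouts that can count toward the switch threshold.
    let last_voted_slot: Slot = 47;
    let locked_out: Vec<_> = lockout_intervals
        .range((Included(last_voted_slot), Unbounded))
        .flat_map(|(_interval_end, entries)| entries.iter())
        .collect();
    assert_eq!(locked_out, vec![&(12u64, "vote_b")]);
}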
@@ -1107,8 +1120,11 @@ pub mod test {

         // Fill the BankForks according to the above fork structure
         vote_simulator.fill_bank_forks(forks, &HashMap::new());
+        for (_, fork_progress) in vote_simulator.progress.iter_mut() {
+            fork_progress.fork_stats.computed = true;
+        }
         let ancestors = vote_simulator.bank_forks.read().unwrap().ancestors();
-        let descendants = vote_simulator.bank_forks.read().unwrap().descendants();
+        let mut descendants = vote_simulator.bank_forks.read().unwrap().descendants();
         let mut tower = Tower::new_with_key(&my_pubkey);

         // Last vote is 47
@@ -1185,6 +1201,23 @@ pub mod test {
             SwitchForkDecision::FailedSwitchThreshold
         );

+        // Adding another validator lockout on a different fork, and the lockout
+        // covers the last vote would count towards the switch threshold,
+        // unless the bank is not the most recent frozen bank on the fork (14 is a
+        // frozen/computed bank > 13 on the same fork in this case)
+        vote_simulator.simulate_lockout_interval(13, (12, 47), &other_vote_account);
+        assert_eq!(
+            tower.check_switch_threshold(
+                110,
+                &ancestors,
+                &descendants,
+                &vote_simulator.progress,
+                total_stake,
+                bank0.epoch_vote_accounts(0).unwrap(),
+            ),
+            SwitchForkDecision::FailedSwitchThreshold
+        );
+
         // Adding another validator lockout on a different fork, and the lockout
         // covers the last vote, should satisfy the switch threshold
         vote_simulator.simulate_lockout_interval(14, (12, 47), &other_vote_account);
@@ -1200,10 +1233,26 @@ pub mod test {
             SwitchForkDecision::SwitchProof(Hash::default())
         );

+        // Adding another unfrozen descendant of the tip of 14 should not remove
+        // slot 14 from consideration because it is still the most recent frozen
+        // bank on its fork
+        descendants.get_mut(&14).unwrap().insert(10000);
+        assert_eq!(
+            tower.check_switch_threshold(
+                110,
+                &ancestors,
+                &descendants,
+                &vote_simulator.progress,
+                total_stake,
+                bank0.epoch_vote_accounts(0).unwrap(),
+            ),
+            SwitchForkDecision::SwitchProof(Hash::default())
+        );
+
         // If we set a root, then any lockout intervals below the root shouldn't
         // count toward the switch threshold. This means the other validator's
         // vote lockout no longer counts
-        vote_simulator.set_root(43);
+        tower.lockouts.root_slot = Some(43);
         // Refresh ancestors and descendants for new root.
         let ancestors = vote_simulator.bank_forks.read().unwrap().ancestors();
         let descendants = vote_simulator.bank_forks.read().unwrap().descendants();
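Once the loop in `check_switch_threshold` has accumulated `locked_out_stake`, the decision reduces to a single ratio check against SWITCH_FORK_THRESHOLD. Below is a minimal, self-contained sketch of that final comparison; the constant's value and the simplified enum (with the switch-proof hash omitted) are assumptions for illustration, not the exact definitions in consensus.rs.

// Assumed value; the real constant is defined in consensus.rs.
const SWITCH_FORK_THRESHOLD: f64 = 0.38;

#[derive(Debug, PartialEq)]
enum SwitchForkDecision {
    SwitchProof,
    FailedSwitchThreshold,
}

// Switching to another fork is allowed only if the stake locked out at
// `last_vote` on other forks exceeds the threshold fraction of total stake.
fn switch_decision(locked_out_stake: u64, total_stake: u64) -> SwitchForkDecision {
    if (locked_out_stake as f64 / total_stake as f64) > SWITCH_FORK_THRESHOLD {
        SwitchForkDecision::SwitchProof
    } else {
        SwitchForkDecision::FailedSwitchThreshold
    }
}

fn main() {
    // 38% exactly is not enough; the ratio must be strictly greater.
    assert_eq!(
        switch_decision(3_800, 10_000),
        SwitchForkDecision::FailedSwitchThreshold
    );
    assert_eq!(switch_decision(3_801, 10_000), SwitchForkDecision::SwitchProof);
}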
@@ -304,6 +304,60 @@ pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo],
     }
 }

+pub fn check_no_new_roots(
+    num_slots_to_wait: usize,
+    contact_infos: &[ContactInfo],
+    test_name: &str,
+) {
+    assert!(!contact_infos.is_empty());
+    let mut roots = vec![0; contact_infos.len()];
+    let max_slot = contact_infos
+        .iter()
+        .enumerate()
+        .map(|(i, ingress_node)| {
+            let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE);
+            let initial_root = client
+                .get_slot()
+                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id));
+            roots[i] = initial_root;
+            client
+                .get_slot_with_commitment(CommitmentConfig::recent())
+                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id))
+        })
+        .max()
+        .unwrap();
+
+    let end_slot = max_slot + num_slots_to_wait as u64;
+    let mut current_slot;
+    let mut last_print = Instant::now();
+    let client = create_client(contact_infos[0].client_facing_addr(), VALIDATOR_PORT_RANGE);
+    loop {
+        current_slot = client
+            .get_slot_with_commitment(CommitmentConfig::recent())
+            .unwrap_or_else(|_| panic!("get_slot for {} failed", contact_infos[0].id));
+        if current_slot > end_slot {
+            break;
+        }
+        if last_print.elapsed().as_secs() > 3 {
+            info!(
+                "{} current slot: {}, waiting for slot: {}",
+                test_name, current_slot, end_slot
+            );
+            last_print = Instant::now();
+        }
+    }
+
+    for (i, ingress_node) in contact_infos.iter().enumerate() {
+        let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE);
+        assert_eq!(
+            client
+                .get_slot()
+                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id)),
+            roots[i]
+        );
+    }
+}
+
 fn poll_all_nodes_for_signature(
     entry_point_info: &ContactInfo,
     cluster_nodes: &[ContactInfo],
@@ -360,6 +360,25 @@ impl LocalCluster {
         info!("{} done waiting for roots", test_name);
     }

+    pub fn check_no_new_roots(&self, num_slots_to_wait: usize, test_name: &str) {
+        let alive_node_contact_infos: Vec<_> = self
+            .validators
+            .values()
+            .map(|v| v.info.contact_info.clone())
+            .collect();
+        assert!(!alive_node_contact_infos.is_empty());
+        info!("{} discovering nodes", test_name);
+        let cluster_nodes = discover_cluster(
+            &alive_node_contact_infos[0].gossip,
+            alive_node_contact_infos.len(),
+        )
+        .unwrap();
+        info!("{} discovered {} nodes", test_name, cluster_nodes.len());
+        info!("{} making sure no new roots on any nodes", test_name);
+        cluster_tests::check_no_new_roots(num_slots_to_wait, &alive_node_contact_infos, test_name);
+        info!("{} done waiting for roots", test_name);
+    }
+
     fn transfer_with_client(
         client: &ThinClient,
         source_keypair: &Keypair,
@@ -4,8 +4,10 @@ use serial_test_derive::serial;
 use solana_client::rpc_client::RpcClient;
 use solana_client::thin_client::create_client;
 use solana_core::{
-    broadcast_stage::BroadcastStageType, consensus::VOTE_THRESHOLD_DEPTH,
-    gossip_service::discover_cluster, validator::ValidatorConfig,
+    broadcast_stage::BroadcastStageType,
+    consensus::{SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
+    gossip_service::discover_cluster,
+    validator::ValidatorConfig,
 };
 use solana_download_utils::download_snapshot;
 use solana_ledger::{
@@ -407,6 +409,137 @@ fn test_kill_heaviest_partition() {
     )
 }

+#[allow(clippy::assertions_on_constants)]
+fn run_kill_partition_switch_threshold<F>(
+    failures_stake: u64,
+    alive_stake_1: u64,
+    alive_stake_2: u64,
+    on_partition_resolved: F,
+) where
+    F: Fn(&mut LocalCluster),
+{
+    // Needs to be at least 1/3 or there will be no overlap
+    // with the confirmation supermajority 2/3
+    assert!(SWITCH_FORK_THRESHOLD >= 1f64 / 3f64);
+    info!(
+        "stakes: {} {} {}",
+        failures_stake, alive_stake_1, alive_stake_2
+    );
+
+    // This test:
+    // 1) Spins up three partitions
+    // 2) Kills the first partition with the stake `failures_stake`
+    // 5) runs `on_partition_resolved`
+    let mut leader_schedule = vec![];
+    let num_slots_per_validator = 8;
+    let partitions: [&[usize]; 3] = [
+        &[(failures_stake as usize)],
+        &[(alive_stake_1 as usize)],
+        &[(alive_stake_2 as usize)],
+    ];
+    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
+        .take(partitions.len())
+        .collect();
+    for (i, k) in validator_keys.iter().enumerate() {
+        let num_slots = {
+            if i == 0 {
+                // Set up the leader to have 50% of the slots
+                num_slots_per_validator * (partitions.len() - 1)
+            } else {
+                num_slots_per_validator
+            }
+        };
+        for _ in 0..num_slots {
+            leader_schedule.push(k.pubkey())
+        }
+    }
+    info!("leader_schedule: {}", leader_schedule.len());
+
+    let validator_to_kill = validator_keys[0].pubkey();
+    let on_partition_start = |cluster: &mut LocalCluster| {
+        info!("Killing validator with id: {}", validator_to_kill);
+        cluster.exit_node(&validator_to_kill);
+    };
+    run_cluster_partition(
+        &partitions,
+        Some((
+            LeaderSchedule::new_from_schedule(leader_schedule),
+            validator_keys,
+        )),
+        on_partition_start,
+        on_partition_resolved,
+    )
+}
+
+#[test]
+#[serial]
+fn test_kill_partition_switch_threshold_no_progress() {
+    let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD;
+    let total_stake = 10_000;
+    let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64;
+
+    let failures_stake = max_failures_stake;
+    let total_alive_stake = total_stake - failures_stake;
+    let alive_stake_1 = total_alive_stake / 2;
+    let alive_stake_2 = total_alive_stake - alive_stake_1;
+
+    // Check that no new roots were set 400 slots after partition resolves (gives time
+    // for lockouts built during partition to resolve and gives validators an opportunity
+    // to try and switch forks)
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_no_new_roots(400, &"PARTITION_TEST");
+    };
+
+    // This kills `max_failures_stake`, so no progress should be made
+    run_kill_partition_switch_threshold(
+        failures_stake,
+        alive_stake_1,
+        alive_stake_2,
+        on_partition_resolved,
+    );
+}
+
+#[test]
+#[serial]
+fn test_kill_partition_switch_threshold() {
+    let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD;
+    let total_stake = 10_000;
+
+    // Kill `< max_failures_stake` of the validators
+    let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64;
+    let failures_stake = max_failures_stake - 1;
+    let total_alive_stake = total_stake - failures_stake;
+
+    // Partition the remaining alive validators, should still make progress
+    // once the partition resolves
+    let alive_stake_1 = total_alive_stake / 2;
+    let alive_stake_2 = total_alive_stake - alive_stake_1;
+    let bigger = std::cmp::max(alive_stake_1, alive_stake_2);
+    let smaller = std::cmp::min(alive_stake_1, alive_stake_2);
+
+    // At least one of the forks must have > SWITCH_FORK_THRESHOLD in order
+    // to guarantee switching proofs can be created. Make sure the other fork
+    // is <= SWITCH_FORK_THRESHOLD to make sure progress can be made. Catches
+    // bugs such as liveness issues in bank-weighted fork choice, which may stall
+    // because the fork with less stake could have more weight, but the other fork would:
+    // 1) Not be able to generate a switching proof
+    // 2) Other more staked fork stops voting, so doesn't catch up in bank weight.
+    assert!(
+        bigger as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD
+            && smaller as f64 / total_stake as f64 <= SWITCH_FORK_THRESHOLD
+    );
+
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_for_new_roots(16, &"PARTITION_TEST");
+    };
+    run_kill_partition_switch_threshold(
+        failures_stake,
+        alive_stake_1,
+        alive_stake_2,
+        on_partition_resolved,
+    );
+}
+
 #[test]
 #[serial]
 fn test_two_unbalanced_stakes() {
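The stake arithmetic in the two tests above pivots on SWITCH_FORK_THRESHOLD: killing 1.0 - 2.0 * SWITCH_FORK_THRESHOLD of the stake leaves two live partitions that each sit exactly at the threshold, while killing one unit less tips one partition strictly above it. A small worked sketch of those numbers, assuming SWITCH_FORK_THRESHOLD is 0.38 (the constant's value is not shown in this diff):

fn main() {
    // Assumed threshold value; not shown in this diff.
    let switch_fork_threshold = 0.38_f64;
    let total_stake = 10_000_u64;

    // max_switch_threshold_failure_pct = 1.0 - 2.0 * 0.38 = 0.24, so
    // max_failures_stake works out to 2_400 of the 10_000 total stake.
    //
    // test_kill_partition_switch_threshold_no_progress kills 2_400, leaving
    // two live partitions of 3_800 each. Neither exceeds 38% of total stake,
    // so neither can assemble a switching proof and no new roots appear.
    assert!(3_800 as f64 / total_stake as f64 <= switch_fork_threshold);

    // test_kill_partition_switch_threshold kills 2_399, leaving 3_800 + 3_801.
    // The larger partition is strictly above 38%, so switching proofs are
    // possible and the cluster keeps rooting new slots after the partition.
    assert!(3_801 as f64 / total_stake as f64 > switch_fork_threshold);
}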