Add test for making sure switch doesn't happen past failure threshold (#11138)
Fix switch threshold

Co-authored-by: Carl <carl@solana.com>
@@ -405,6 +405,7 @@ impl Tower {
         total_stake: u64,
         epoch_vote_accounts: &HashMap<Pubkey, (u64, Account)>,
     ) -> SwitchForkDecision {
+        let root = self.lockouts.root_slot.unwrap_or(0);
         self.last_voted_slot()
             .map(|last_voted_slot| {
                 let last_vote_ancestors = ancestors.get(&last_voted_slot).unwrap();
@@ -427,12 +428,22 @@ impl Tower {
                 let mut locked_out_stake = 0;
                 let mut locked_out_vote_accounts = HashSet::new();
                 for (candidate_slot, descendants) in descendants.iter() {
-                    // 1) Only consider lockouts a tips of forks as that
-                    // includes all ancestors of that fork.
-                    // 2) Don't consider lockouts on the `last_vote` itself
-                    // 3) Don't consider lockouts on any descendants of
+                    // 1) Don't consider any banks that haven't been frozen yet
+                    // because the needed stats are unavailable
+                    // 2) Only consider lockouts at the latest `frozen` bank
+                    // on each fork, as that bank will contain all the
+                    // lockout intervals for ancestors on that fork as well.
+                    // 3) Don't consider lockouts on the `last_vote` itself
+                    // 4) Don't consider lockouts on any descendants of
                     // `last_vote`
-                    if !descendants.is_empty()
+                    // 5) Don't consider any banks before the root because
+                    // all lockouts must be ancestors of `last_vote`
+                    if !progress.get_fork_stats(*candidate_slot).map(|stats| stats.computed).unwrap_or(false)
+                        // If any of the descendants have the `computed` flag set, then there must be a more
+                        // recent frozen bank on this fork to use, so we can ignore this one. Otherwise,
+                        // even if this bank has descendants, if they have not yet been frozen / stats computed,
+                        // then use this bank as a representative for the fork.
+                        || descendants.iter().any(|d| progress.get_fork_stats(*d).map(|stats| stats.computed).unwrap_or(false))
                         || *candidate_slot == last_voted_slot
                         || ancestors
                             .get(&candidate_slot)
@@ -441,6 +452,7 @@ impl Tower {
                                 exist in the ancestors map",
                             )
                             .contains(&last_voted_slot)
+                        || *candidate_slot <= root
                     {
                         continue;
                     }
@@ -461,17 +473,18 @@ impl Tower {
                        .lockout_intervals;
                    // Find any locked out intervals in this bank with endpoint >= last_vote,
                    // implies they are locked out at last_vote
-                   for (_lockout_ineterval_end, value) in lockout_intervals.range((Included(last_voted_slot), Unbounded)) {
+                   for (_lockout_interval_end, value) in lockout_intervals.range((Included(last_voted_slot), Unbounded)) {
                        for (lockout_interval_start, vote_account_pubkey) in value {
                            // Only count lockouts on slots that are:
                            // 1) Not ancestors of `last_vote`
                            // 2) Not from before the current root as we can't determine if
                            // anything before the root was an ancestor of `last_vote` or not
                            if !last_vote_ancestors.contains(lockout_interval_start)
-                               // The check if the key exists in the ancestors map
-                               // is equivalent to checking if the key is above the
-                               // current root.
-                               && ancestors.contains_key(lockout_interval_start)
+                               // Given a `lockout_interval_start` < root that appears in a
+                               // bank for a `candidate_slot`, it must be that `lockout_interval_start`
+                               // is an ancestor of the current root, because `candidate_slot` is a
+                               // descendant of the current root
+                               && *lockout_interval_start > root
                                && !locked_out_vote_accounts.contains(vote_account_pubkey)
                            {
                                let stake = epoch_vote_accounts
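The range query above relies on each bank's lockout intervals being kept in a map keyed by the slot at which the lockout expires. A minimal, standalone sketch of that lookup follows; the element layout, names, and the simple string stand-in for the vote account pubkey are illustrative assumptions, not the exact types used by the fork progress map.

use std::collections::BTreeMap;
use std::ops::Bound::{Included, Unbounded};

type Slot = u64;
type VoteAccount = &'static str; // stand-in for the real vote account pubkey

fn main() {
    // Keyed by the slot at which a lockout expires; each entry lists
    // (lockout start slot, vote account) pairs observed in that bank.
    let mut lockout_intervals: BTreeMap<Slot, Vec<(Slot, VoteAccount)>> = BTreeMap::new();
    lockout_intervals.entry(46).or_default().push((14, "vote_a"));
    lockout_intervals.entry(60).or_default().push((12, "vote_b"));

    // Any interval whose endpoint is >= `last_voted_slot` still locks its
    // voter out at `last_voted_slot`, so a range query from that key upward
    // finds exactly the lockouts that can count toward the switch threshold.
    let last_voted_slot: Slot = 47;
    let locked_out: Vec<_> = lockout_intervals
        .range((Included(last_voted_slot), Unbounded))
        .flat_map(|(_interval_end, entries)| entries.iter())
        .collect();
    assert_eq!(locked_out, vec![&(12u64, "vote_b")]);
}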
@@ -1107,8 +1120,11 @@ pub mod test {

         // Fill the BankForks according to the above fork structure
         vote_simulator.fill_bank_forks(forks, &HashMap::new());
+        for (_, fork_progress) in vote_simulator.progress.iter_mut() {
+            fork_progress.fork_stats.computed = true;
+        }
         let ancestors = vote_simulator.bank_forks.read().unwrap().ancestors();
-        let descendants = vote_simulator.bank_forks.read().unwrap().descendants();
+        let mut descendants = vote_simulator.bank_forks.read().unwrap().descendants();
         let mut tower = Tower::new_with_key(&my_pubkey);

         // Last vote is 47
@@ -1185,6 +1201,23 @@ pub mod test {
             SwitchForkDecision::FailedSwitchThreshold
         );

+        // Adding another validator lockout on a different fork, and the lockout
+        // covers the last vote would count towards the switch threshold,
+        // unless the bank is not the most recent frozen bank on the fork (14 is a
+        // frozen/computed bank > 13 on the same fork in this case)
+        vote_simulator.simulate_lockout_interval(13, (12, 47), &other_vote_account);
+        assert_eq!(
+            tower.check_switch_threshold(
+                110,
+                &ancestors,
+                &descendants,
+                &vote_simulator.progress,
+                total_stake,
+                bank0.epoch_vote_accounts(0).unwrap(),
+            ),
+            SwitchForkDecision::FailedSwitchThreshold
+        );
+
         // Adding another validator lockout on a different fork, and the lockout
         // covers the last vote, should satisfy the switch threshold
         vote_simulator.simulate_lockout_interval(14, (12, 47), &other_vote_account);
@@ -1200,10 +1233,26 @@ pub mod test {
             SwitchForkDecision::SwitchProof(Hash::default())
         );

+        // Adding another unfrozen descendant of the tip of 14 should not remove
+        // slot 14 from consideration because it is still the most recent frozen
+        // bank on its fork
+        descendants.get_mut(&14).unwrap().insert(10000);
+        assert_eq!(
+            tower.check_switch_threshold(
+                110,
+                &ancestors,
+                &descendants,
+                &vote_simulator.progress,
+                total_stake,
+                bank0.epoch_vote_accounts(0).unwrap(),
+            ),
+            SwitchForkDecision::SwitchProof(Hash::default())
+        );
+
         // If we set a root, then any lockout intervals below the root shouldn't
         // count toward the switch threshold. This means the other validator's
         // vote lockout no longer counts
-        vote_simulator.set_root(43);
+        tower.lockouts.root_slot = Some(43);
         // Refresh ancestors and descendants for new root.
         let ancestors = vote_simulator.bank_forks.read().unwrap().ancestors();
         let descendants = vote_simulator.bank_forks.read().unwrap().descendants();
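Once the loop in `check_switch_threshold` has accumulated `locked_out_stake`, the decision reduces to a single ratio check against SWITCH_FORK_THRESHOLD. Below is a minimal, self-contained sketch of that final comparison; the constant's value and the simplified enum (with the switch-proof hash omitted) are assumptions for illustration, not the exact definitions in consensus.rs.

// Assumed value; the real constant is defined in consensus.rs.
const SWITCH_FORK_THRESHOLD: f64 = 0.38;

#[derive(Debug, PartialEq)]
enum SwitchForkDecision {
    SwitchProof,
    FailedSwitchThreshold,
}

// Switching to another fork is allowed only if the stake locked out at
// `last_vote` on other forks exceeds the threshold fraction of total stake.
fn switch_decision(locked_out_stake: u64, total_stake: u64) -> SwitchForkDecision {
    if (locked_out_stake as f64 / total_stake as f64) > SWITCH_FORK_THRESHOLD {
        SwitchForkDecision::SwitchProof
    } else {
        SwitchForkDecision::FailedSwitchThreshold
    }
}

fn main() {
    // 38% exactly is not enough; the ratio must be strictly greater.
    assert_eq!(
        switch_decision(3_800, 10_000),
        SwitchForkDecision::FailedSwitchThreshold
    );
    assert_eq!(switch_decision(3_801, 10_000), SwitchForkDecision::SwitchProof);
}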
@@ -304,6 +304,60 @@ pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo],
     }
 }

+pub fn check_no_new_roots(
+    num_slots_to_wait: usize,
+    contact_infos: &[ContactInfo],
+    test_name: &str,
+) {
+    assert!(!contact_infos.is_empty());
+    let mut roots = vec![0; contact_infos.len()];
+    let max_slot = contact_infos
+        .iter()
+        .enumerate()
+        .map(|(i, ingress_node)| {
+            let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE);
+            let initial_root = client
+                .get_slot()
+                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id));
+            roots[i] = initial_root;
+            client
+                .get_slot_with_commitment(CommitmentConfig::recent())
+                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id))
+        })
+        .max()
+        .unwrap();
+
+    let end_slot = max_slot + num_slots_to_wait as u64;
+    let mut current_slot;
+    let mut last_print = Instant::now();
+    let client = create_client(contact_infos[0].client_facing_addr(), VALIDATOR_PORT_RANGE);
+    loop {
+        current_slot = client
+            .get_slot_with_commitment(CommitmentConfig::recent())
+            .unwrap_or_else(|_| panic!("get_slot for {} failed", contact_infos[0].id));
+        if current_slot > end_slot {
+            break;
+        }
+        if last_print.elapsed().as_secs() > 3 {
+            info!(
+                "{} current slot: {}, waiting for slot: {}",
+                test_name, current_slot, end_slot
+            );
+            last_print = Instant::now();
+        }
+    }
+
+    for (i, ingress_node) in contact_infos.iter().enumerate() {
+        let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE);
+        assert_eq!(
+            client
+                .get_slot()
+                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id)),
+            roots[i]
+        );
+    }
+}
+
 fn poll_all_nodes_for_signature(
     entry_point_info: &ContactInfo,
     cluster_nodes: &[ContactInfo],
@@ -360,6 +360,25 @@ impl LocalCluster {
         info!("{} done waiting for roots", test_name);
     }

+    pub fn check_no_new_roots(&self, num_slots_to_wait: usize, test_name: &str) {
+        let alive_node_contact_infos: Vec<_> = self
+            .validators
+            .values()
+            .map(|v| v.info.contact_info.clone())
+            .collect();
+        assert!(!alive_node_contact_infos.is_empty());
+        info!("{} discovering nodes", test_name);
+        let cluster_nodes = discover_cluster(
+            &alive_node_contact_infos[0].gossip,
+            alive_node_contact_infos.len(),
+        )
+        .unwrap();
+        info!("{} discovered {} nodes", test_name, cluster_nodes.len());
+        info!("{} making sure no new roots on any nodes", test_name);
+        cluster_tests::check_no_new_roots(num_slots_to_wait, &alive_node_contact_infos, test_name);
+        info!("{} done waiting for roots", test_name);
+    }
+
     fn transfer_with_client(
         client: &ThinClient,
         source_keypair: &Keypair,
@@ -4,8 +4,10 @@ use serial_test_derive::serial;
 use solana_client::rpc_client::RpcClient;
 use solana_client::thin_client::create_client;
 use solana_core::{
-    broadcast_stage::BroadcastStageType, consensus::VOTE_THRESHOLD_DEPTH,
-    gossip_service::discover_cluster, validator::ValidatorConfig,
+    broadcast_stage::BroadcastStageType,
+    consensus::{SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
+    gossip_service::discover_cluster,
+    validator::ValidatorConfig,
 };
 use solana_download_utils::download_snapshot;
 use solana_ledger::{
@@ -407,6 +409,137 @@ fn test_kill_heaviest_partition() {
     )
 }

+#[allow(clippy::assertions_on_constants)]
+fn run_kill_partition_switch_threshold<F>(
+    failures_stake: u64,
+    alive_stake_1: u64,
+    alive_stake_2: u64,
+    on_partition_resolved: F,
+) where
+    F: Fn(&mut LocalCluster),
+{
+    // Needs to be at least 1/3 or there will be no overlap
+    // with the confirmation supermajority 2/3
+    assert!(SWITCH_FORK_THRESHOLD >= 1f64 / 3f64);
+    info!(
+        "stakes: {} {} {}",
+        failures_stake, alive_stake_1, alive_stake_2
+    );
+
+    // This test:
+    // 1) Spins up three partitions
+    // 2) Kills the first partition with the stake `failures_stake`
+    // 5) runs `on_partition_resolved`
+    let mut leader_schedule = vec![];
+    let num_slots_per_validator = 8;
+    let partitions: [&[usize]; 3] = [
+        &[(failures_stake as usize)],
+        &[(alive_stake_1 as usize)],
+        &[(alive_stake_2 as usize)],
+    ];
+    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
+        .take(partitions.len())
+        .collect();
+    for (i, k) in validator_keys.iter().enumerate() {
+        let num_slots = {
+            if i == 0 {
+                // Set up the leader to have 50% of the slots
+                num_slots_per_validator * (partitions.len() - 1)
+            } else {
+                num_slots_per_validator
+            }
+        };
+        for _ in 0..num_slots {
+            leader_schedule.push(k.pubkey())
+        }
+    }
+    info!("leader_schedule: {}", leader_schedule.len());
+
+    let validator_to_kill = validator_keys[0].pubkey();
+    let on_partition_start = |cluster: &mut LocalCluster| {
+        info!("Killing validator with id: {}", validator_to_kill);
+        cluster.exit_node(&validator_to_kill);
+    };
+    run_cluster_partition(
+        &partitions,
+        Some((
+            LeaderSchedule::new_from_schedule(leader_schedule),
+            validator_keys,
+        )),
+        on_partition_start,
+        on_partition_resolved,
+    )
+}
+
+#[test]
+#[serial]
+fn test_kill_partition_switch_threshold_no_progress() {
+    let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD;
+    let total_stake = 10_000;
+    let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64;
+
+    let failures_stake = max_failures_stake;
+    let total_alive_stake = total_stake - failures_stake;
+    let alive_stake_1 = total_alive_stake / 2;
+    let alive_stake_2 = total_alive_stake - alive_stake_1;
+
+    // Check that no new roots were set 400 slots after partition resolves (gives time
+    // for lockouts built during partition to resolve and gives validators an opportunity
+    // to try and switch forks)
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_no_new_roots(400, &"PARTITION_TEST");
+    };
+
+    // This kills `max_failures_stake`, so no progress should be made
+    run_kill_partition_switch_threshold(
+        failures_stake,
+        alive_stake_1,
+        alive_stake_2,
+        on_partition_resolved,
+    );
+}
+
+#[test]
+#[serial]
+fn test_kill_partition_switch_threshold() {
+    let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD;
+    let total_stake = 10_000;
+
+    // Kill `< max_failures_stake` of the validators
+    let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64;
+    let failures_stake = max_failures_stake - 1;
+    let total_alive_stake = total_stake - failures_stake;
+
+    // Partition the remaining alive validators, should still make progress
+    // once the partition resolves
+    let alive_stake_1 = total_alive_stake / 2;
+    let alive_stake_2 = total_alive_stake - alive_stake_1;
+    let bigger = std::cmp::max(alive_stake_1, alive_stake_2);
+    let smaller = std::cmp::min(alive_stake_1, alive_stake_2);
+
+    // At least one of the forks must have > SWITCH_FORK_THRESHOLD in order
+    // to guarantee switching proofs can be created. Make sure the other fork
+    // is <= SWITCH_FORK_THRESHOLD to make sure progress can be made. Catches
+    // bugs such as liveness issues in bank-weighted fork choice, which may stall
+    // because the fork with less stake could have more weight, but the other fork would:
+    // 1) Not be able to generate a switching proof
+    // 2) Other more staked fork stops voting, so doesn't catch up in bank weight.
+    assert!(
+        bigger as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD
+            && smaller as f64 / total_stake as f64 <= SWITCH_FORK_THRESHOLD
+    );
+
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_for_new_roots(16, &"PARTITION_TEST");
+    };
+    run_kill_partition_switch_threshold(
+        failures_stake,
+        alive_stake_1,
+        alive_stake_2,
+        on_partition_resolved,
+    );
+}
+
 #[test]
 #[serial]
 fn test_two_unbalanced_stakes() {
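The stake arithmetic in the two tests above pivots on SWITCH_FORK_THRESHOLD: killing 1.0 - 2.0 * SWITCH_FORK_THRESHOLD of the stake leaves two live partitions that each sit exactly at the threshold, while killing one unit less tips one partition strictly above it. A small worked sketch of those numbers, assuming SWITCH_FORK_THRESHOLD is 0.38 (the constant's value is not shown in this diff):

fn main() {
    // Assumed threshold value; not shown in this diff.
    let switch_fork_threshold = 0.38_f64;
    let total_stake = 10_000_u64;

    // max_switch_threshold_failure_pct = 1.0 - 2.0 * 0.38 = 0.24, so
    // max_failures_stake works out to 2_400 of the 10_000 total stake.
    //
    // test_kill_partition_switch_threshold_no_progress kills 2_400, leaving
    // two live partitions of 3_800 each. Neither exceeds 38% of total stake,
    // so neither can assemble a switching proof and no new roots appear.
    assert!(3_800 as f64 / total_stake as f64 <= switch_fork_threshold);

    // test_kill_partition_switch_threshold kills 2_399, leaving 3_800 + 3_801.
    // The larger partition is strictly above 38%, so switching proofs are
    // possible and the cluster keeps rooting new slots after the partition.
    assert!(3_801 as f64 / total_stake as f64 > switch_fork_threshold);
}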