From bca1d517354fb07e136dd42de2f2110a775b29f7 Mon Sep 17 00:00:00 2001
From: carllin
Date: Wed, 16 Feb 2022 16:13:58 -0500
Subject: [PATCH] Fix flaky optimistic confirmation tests (#23178)

---
 local-cluster/tests/common.rs               |  36 +++--
 local-cluster/tests/local_cluster.rs        |  12 +-
 local-cluster/tests/local_cluster_flakey.rs | 154 +++++++++++---------
 local-cluster/tests/local_cluster_slow.rs   |   9 +-
 4 files changed, 117 insertions(+), 94 deletions(-)

diff --git a/local-cluster/tests/common.rs b/local-cluster/tests/common.rs
index 48389861ae..b16447ef82 100644
--- a/local-cluster/tests/common.rs
+++ b/local-cluster/tests/common.rs
@@ -166,7 +166,8 @@ pub fn run_kill_partition_switch_threshold(
         .flat_map(|stakes_and_slots| stakes_and_slots.iter().map(|(_, num_slots)| *num_slots))
         .collect();

-    let (leader_schedule, validator_keys) = create_custom_leader_schedule(&num_slots_per_validator);
+    let (leader_schedule, validator_keys) =
+        create_custom_leader_schedule_with_random_keys(&num_slots_per_validator);

     info!(
         "Validator ids: {:?}",
@@ -206,23 +207,32 @@ pub fn run_kill_partition_switch_threshold(
 }

 pub fn create_custom_leader_schedule(
-    validator_num_slots: &[usize],
-) -> (LeaderSchedule, Vec<Arc<Keypair>>) {
+    validator_key_to_slots: impl Iterator<Item = (Pubkey, usize)>,
+) -> LeaderSchedule {
     let mut leader_schedule = vec![];
-    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
-        .take(validator_num_slots.len())
-        .collect();
-    for (k, num_slots) in validator_keys.iter().zip(validator_num_slots.iter()) {
-        for _ in 0..*num_slots {
-            leader_schedule.push(k.pubkey())
+    for (k, num_slots) in validator_key_to_slots {
+        for _ in 0..num_slots {
+            leader_schedule.push(k)
         }
     }

     info!("leader_schedule: {}", leader_schedule.len());
-    (
-        LeaderSchedule::new_from_schedule(leader_schedule),
-        validator_keys,
-    )
+    LeaderSchedule::new_from_schedule(leader_schedule)
+}
+
+pub fn create_custom_leader_schedule_with_random_keys(
+    validator_num_slots: &[usize],
+) -> (LeaderSchedule, Vec<Arc<Keypair>>) {
+    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
+        .take(validator_num_slots.len())
+        .collect();
+    let leader_schedule = create_custom_leader_schedule(
+        validator_keys
+            .iter()
+            .map(|k| k.pubkey())
+            .zip(validator_num_slots.iter().cloned()),
+    );
+    (leader_schedule, validator_keys)
 }

 /// This function runs a network, initiates a partition based on a
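For orientation, here is a minimal usage sketch (not part of the patch) of the two helpers that common.rs now exposes: `create_custom_leader_schedule` takes explicit `(Pubkey, usize)` pairs so a test can pin known validator identities to particular leader slots, while `create_custom_leader_schedule_with_random_keys` keeps the old behavior of generating random keypairs for callers that only care about slot counts. The keypair names below are illustrative, and the snippet assumes the common.rs helpers are in scope:

// Illustrative sketch only; assumes the helpers above are imported, e.g. via `use common::*`.
use {
    solana_sdk::signature::{Keypair, Signer},
    std::sync::Arc,
};

fn custom_leader_schedule_example() {
    // Pin two known validators to fixed runs of leader slots.
    let validator_b = Arc::new(Keypair::new());
    let validator_c = Arc::new(Keypair::new());
    let validator_to_slots = vec![
        (validator_b.pubkey(), 10), // validator B leads slots 0..=9
        (validator_c.pubkey(), 10), // validator C leads slots 10..=19
    ];
    let leader_schedule = create_custom_leader_schedule(validator_to_slots.into_iter());
    assert_eq!(leader_schedule[0], validator_b.pubkey());

    // Callers that do not care which keys lead keep the old behavior through the wrapper,
    // which generates fresh random keypairs and returns them alongside the schedule.
    let (_schedule, _random_keys) = create_custom_leader_schedule_with_random_keys(&[10, 10]);
}

This is the shape the flakey test below relies on when it pins validators B and C as the only leaders.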
diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs
index 7a8c4b12c4..57b3ab409e 100644
--- a/local-cluster/tests/local_cluster.rs
+++ b/local-cluster/tests/local_cluster.rs
@@ -2,9 +2,9 @@
 use {
     assert_matches::assert_matches,
     common::{
-        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, ms_for_n_slots,
-        open_blockstore, purge_slots, remove_tower, restore_tower, run_cluster_partition,
-        run_kill_partition_switch_threshold, test_faulty_node,
+        copy_blocks, create_custom_leader_schedule_with_random_keys, last_vote_in_tower,
+        ms_for_n_slots, open_blockstore, purge_slots, remove_tower, restore_tower,
+        run_cluster_partition, run_kill_partition_switch_threshold, test_faulty_node,
         wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     crossbeam_channel::{unbounded, Receiver},
@@ -2489,8 +2489,10 @@ fn test_run_test_load_program_accounts_partition_root() {
 fn run_test_load_program_accounts_partition(scan_commitment: CommitmentConfig) {
     let num_slots_per_validator = 8;
     let partitions: [Vec<usize>; 2] = [vec![1], vec![1]];
-    let (leader_schedule, validator_keys) =
-        create_custom_leader_schedule(&[num_slots_per_validator, num_slots_per_validator]);
+    let (leader_schedule, validator_keys) = create_custom_leader_schedule_with_random_keys(&[
+        num_slots_per_validator,
+        num_slots_per_validator,
+    ]);

     let (update_client_sender, update_client_receiver) = unbounded();
     let (scan_client_sender, scan_client_receiver) = unbounded();
diff --git a/local-cluster/tests/local_cluster_flakey.rs b/local-cluster/tests/local_cluster_flakey.rs
index ae21ff9899..f38cc943ad 100644
--- a/local-cluster/tests/local_cluster_flakey.rs
+++ b/local-cluster/tests/local_cluster_flakey.rs
@@ -3,8 +3,8 @@
 #![allow(clippy::integer_arithmetic)]
 use {
     common::{
-        copy_blocks, last_vote_in_tower, open_blockstore, purge_slots, remove_tower,
-        wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
+        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, open_blockstore,
+        purge_slots, remove_tower, wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     log::*,
     serial_test::serial,
@@ -13,13 +13,17 @@ use {
         ancestor_iterator::AncestorIterator,
         blockstore::Blockstore,
         blockstore_db::{AccessType, BlockstoreOptions},
+        leader_schedule::FixedSchedule,
     },
     solana_local_cluster::{
         cluster::Cluster,
         local_cluster::{ClusterConfig, LocalCluster},
         validator_configs::*,
     },
-    solana_sdk::signature::{Keypair, Signer},
+    solana_sdk::{
+        clock::Slot,
+        signature::{Keypair, Signer},
+    },
     solana_streamer::socket::SocketAddrSpace,
     std::{
         sync::Arc,
@@ -62,27 +66,22 @@ fn test_optimistic_confirmation_violation_without_tower() {
 //
 // S0 -> S1 -> S2 -> S4
 //
-// Step 3: Then restart `A` which had 31% of the stake. With the tower, from `A`'s
-// perspective it sees:
+// Step 3: Then restart `A` which had 31% of the stake, and remove S3 from its ledger, so
+// that it only sees `C`'s fork at S2. From `A`'s perspective it sees:
 //
-// S0 -> S1 -> S2 -> S3 (voted)
+// S0 -> S1 -> S2
 //             |
 //             -> S4 -> S5 (C's vote for S4)
 //
 // The fork choice rule weights look like:
 //
-// S0 -> S1 -> S2 (ABC) -> S3
+// S0 -> S1 -> S2 (ABC)
 //             |
 //             -> S4 (C) -> S5
 //
 // Step 5:
 // Without the persisted tower:
-//    `A` would choose to vote on the fork with `S4 -> S5`. This is true even if `A`
-//    generates a new fork starting at slot `S3` because `C` has more stake than `A`
-//    so `A` will eventually pick the fork `C` is on.
-//
-//    Furthermore `B`'s vote on `S3` is not observable because there are no
-//    descendants of slot `S3`, so that fork will not be chosen over `C`'s fork
+//    `A` would choose to vote on the fork with `S4 -> S5`.
 //
 // With the persisted tower:
 //    `A` should not be able to generate a switching proof.
@@ -94,6 +93,10 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     let slots_per_epoch = 2048;
     let node_stakes = vec![31, 36, 33, 0];

+    let base_slot: Slot = 26; // S2
+    let next_slot_on_a: Slot = 27; // S3
+    let truncated_slots: Slot = 100; // just enough to purge all following slots after the S2 and S3
+
     // Each pubkeys are prefixed with A, B, C and D.
     // D is needed to:
     // 1) Propagate A's votes for S2 to validator C after A shuts down so that
@@ -128,10 +131,29 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     // because if validator A votes past `next_slot_on_a`, and then we copy over validator B's ledger
     // below only for slots <= `next_slot_on_a`, validator A will not know how it's last vote chains
     // to the otehr forks, and may violate switching proofs on restart.
-    let mut validator_configs =
-        make_identical_validator_configs(&ValidatorConfig::default_for_test(), node_stakes.len());
+    let mut default_config = ValidatorConfig::default_for_test();
+    // Split leader schedule 50-50 between validators B and C, don't give validator A any slots because
+    // it's going to be deleting its ledger, so may create versions of slots it's already created, but
+    // on a different fork.
+    let validator_to_slots = vec![
+        // Ensure validator b is leader for slots <= `next_slot_on_a`
+        (validator_b_pubkey, next_slot_on_a as usize + 1),
+        (validator_c_pubkey, next_slot_on_a as usize + 1),
+    ];

-    validator_configs[0].voting_disabled = true;
+    let leader_schedule = create_custom_leader_schedule(validator_to_slots.into_iter());
+    for slot in 0..=next_slot_on_a {
+        assert_eq!(leader_schedule[slot], validator_b_pubkey);
+    }
+
+    default_config.fixed_leader_schedule = Some(FixedSchedule {
+        start_epoch: 0,
+        leader_schedule: Arc::new(leader_schedule),
+    });
+    let mut validator_configs =
+        make_identical_validator_configs(&default_config, node_stakes.len());
+
+    // Disable voting on validator C
     validator_configs[2].voting_disabled = true;

     let mut config = ClusterConfig {
@@ -146,10 +168,6 @@
     };
     let mut cluster = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);

-    let base_slot = 26; // S2
-    let next_slot_on_a = 27; // S3
-    let truncated_slots = 100; // just enough to purge all following slots after the S2 and S3
-
     let val_a_ledger_path = cluster.ledger_path(&validator_a_pubkey);
     let val_b_ledger_path = cluster.ledger_path(&validator_b_pubkey);
     let val_c_ledger_path = cluster.ledger_path(&validator_c_pubkey);
@@ -167,43 +185,65 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         validator_c_pubkey, val_c_ledger_path
     );

-    // Immediately kill validator A, and C
-    info!("Exiting validators A and C");
-    let mut validator_a_info = cluster.exit_node(&validator_a_pubkey);
+    // Immediately kill validator C. No need to kill validator A because
+    // 1) It has no slots in the leader schedule, so no way to make forks
+    // 2) We need it to vote
+    info!("Exiting validator C");
     let mut validator_c_info = cluster.exit_node(&validator_c_pubkey);

     // Step 1:
-    // Let validator B, (D) run for a while.
+    // Let validator A, B, (D) run. Wait for both `A` and `B` to have voted on `next_slot_on_a` or
+    // one of its descendants
+    info!(
+        "Waiting on both validators A and B to vote on fork at slot {}",
+        next_slot_on_a
+    );
     let now = Instant::now();
+    let mut last_b_vote = 0;
+    let mut last_a_vote = 0;
     loop {
         let elapsed = now.elapsed();
         assert!(
             elapsed <= Duration::from_secs(30),
-            "Validator B failed to vote on any slot >= {} in {} secs",
+            "One of the validators failed to vote on a slot >= {} in {} secs,
+            last validator A vote: {},
+            last validator B vote: {}",
             next_slot_on_a,
-            elapsed.as_secs()
+            elapsed.as_secs(),
+            last_a_vote,
+            last_b_vote,
         );
         sleep(Duration::from_millis(100));

         if let Some((last_vote, _)) = last_vote_in_tower(&val_b_ledger_path, &validator_b_pubkey) {
+            last_b_vote = last_vote;
+            if last_vote < next_slot_on_a {
+                continue;
+            }
+        }
+
+        if let Some((last_vote, _)) = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
+            last_a_vote = last_vote;
             if last_vote >= next_slot_on_a {
                 break;
             }
         }
     }
-    // kill B
+
+    // kill A and B
     let _validator_b_info = cluster.exit_node(&validator_b_pubkey);
+    let validator_a_info = cluster.exit_node(&validator_a_pubkey);

     // Step 2:
-    // Stop validator and truncate ledger, copy over B's ledger to A
-    info!("truncate validator C's ledger");
+    // Stop validator and truncate ledger, copy over B's ledger to C
+    info!("Create validator C's ledger");
     {
         // first copy from validator B's ledger
         std::fs::remove_dir_all(&validator_c_info.info.ledger_path).unwrap();
         let mut opt = fs_extra::dir::CopyOptions::new();
         opt.copy_inside = true;
         fs_extra::dir::copy(&val_b_ledger_path, &val_c_ledger_path, &opt).unwrap();
-        // Remove B's tower in the C's new copied ledger
+        // Remove B's tower in C's new copied ledger
         remove_tower(&val_c_ledger_path, &validator_b_pubkey);

         let blockstore = open_blockstore(&val_c_ledger_path);
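The waiting logic added in the hunk above is what later lets the test assert directly on validator A's tower instead of restarting A and polling again. A condensed sketch of that wait pattern as a stand-alone helper (hypothetical name; it builds only on the existing `last_vote_in_tower` helper from common.rs) might look like the following; the patch itself keeps the logic inline:

// Hypothetical helper, not part of the patch: poll a validator's on-disk tower until it
// records a vote at or past `target_slot`, or panic once `timeout` elapses.
// Assumes `last_vote_in_tower` from local-cluster/tests/common.rs is in scope.
use {
    solana_sdk::{clock::Slot, pubkey::Pubkey},
    std::{
        path::Path,
        thread::sleep,
        time::{Duration, Instant},
    },
};

fn wait_for_vote_at_or_after(
    ledger_path: &Path,
    node_pubkey: &Pubkey,
    target_slot: Slot,
    timeout: Duration,
) -> Slot {
    let start = Instant::now();
    loop {
        // Read the persisted tower; `None` means the validator has not voted yet.
        if let Some((last_vote, _)) = last_vote_in_tower(ledger_path, node_pubkey) {
            if last_vote >= target_slot {
                return last_vote;
            }
        }
        assert!(
            start.elapsed() <= timeout,
            "{} did not vote on a slot >= {} within {:?}",
            node_pubkey,
            target_slot,
            timeout
        );
        sleep(Duration::from_millis(100));
    }
}

The test tracks A's and B's last votes separately so that, on timeout, the assertion message shows how far each validator got.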
@@ -224,44 +264,14 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         purge_slots(&a_blockstore, next_slot_on_a + 1, truncated_slots);
     }

-    // Step 3:
-    // Restart A with voting enabled so that it can vote on B's fork
-    // up to `next_slot_on_a`, thereby optimistcally confirming `next_slot_on_a`
-    info!("Restarting A");
-    validator_a_info.config.voting_disabled = false;
-    cluster.restart_node(
-        &validator_a_pubkey,
-        validator_a_info,
-        SocketAddrSpace::Unspecified,
-    );
+    // This should be guaranteed because we waited for validator `A` to vote on a slot > `next_slot_on_a`
+    // before killing it earlier.
+    info!("Checking A's tower for a vote on slot descended from slot `next_slot_on_a`");
+    let last_vote_slot = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
+        .unwrap()
+        .0;
+    assert!(last_vote_slot >= next_slot_on_a);

-    info!("Waiting for A to vote on slot descended from slot `next_slot_on_a`");
-    let now = Instant::now();
-    loop {
-        if let Some((last_vote_slot, _)) =
-            last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
-        {
-            if last_vote_slot >= next_slot_on_a {
-                info!(
-                    "Validator A has caught up and voted on slot: {}",
-                    last_vote_slot
-                );
-                break;
-            }
-        }
-
-        if now.elapsed().as_secs() >= 30 {
-            panic!(
-                "Validator A has not seen optimistic confirmation slot > {} in 30 seconds",
-                next_slot_on_a
-            );
-        }
-
-        sleep(Duration::from_millis(20));
-    }
-
-    info!("Killing A");
-    let validator_a_info = cluster.exit_node(&validator_a_pubkey);
     {
         let blockstore = open_blockstore(&val_a_ledger_path);
         purge_slots(&blockstore, next_slot_on_a + 1, truncated_slots);
@@ -270,9 +280,9 @@
             remove_tower(&val_a_ledger_path, &validator_a_pubkey);

             // Remove next_slot_on_a from ledger to force validator A to select
-            // votes_on_c_fork. Otherwise the validator A will immediately vote
-            // for 27 on restart, because it hasn't gotten the heavier fork from
-            // validator C yet.
+            // votes_on_c_fork. Otherwise, in the test case without a tower,
+            // the validator A will immediately vote for 27 on restart, because it
+            // hasn't gotten the heavier fork from validator C yet.
             // Then it will be stuck on 27 unable to switch because C doesn't
             // have enough stake to generate a switching proof
             purge_slots(&blockstore, next_slot_on_a, truncated_slots);
@@ -281,7 +291,7 @@
         }
     }

-    // Step 4:
+    // Step 3:
     // Run validator C only to make it produce and vote on its own fork.
     info!("Restart validator C again!!!");
     validator_c_info.config.voting_disabled = false;
     cluster.restart_node(
         &validator_c_pubkey,
         validator_c_info,
         SocketAddrSpace::Unspecified,
     );
@@ -308,7 +318,7 @@
     assert!(!votes_on_c_fork.is_empty());
     info!("collected validator C's votes: {:?}", votes_on_c_fork);

-    // Step 5:
+    // Step 4:
     // verify whether there was violation or not
     info!("Restart validator A again!!!");
     cluster.restart_node(
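The flakey test's final step (renumbered to Step 4 above) restarts `A` and then checks whether optimistic confirmation was violated; that check itself lies outside the hunks shown here. As a rough, illustrative sketch of what such a check can look like against a blockstore (hypothetical helper, not the test's literal code):

// Sketch only: a slot that was optimistically confirmed must remain an ancestor of
// (or be equal to) any later root; otherwise the cluster abandoned a confirmed fork.
use {
    solana_ledger::{ancestor_iterator::AncestorIterator, blockstore::Blockstore},
    solana_sdk::clock::Slot,
};

fn is_optimistic_confirmation_violation(
    blockstore: &Blockstore,
    new_root: Slot,
    optimistically_confirmed: Slot,
) -> bool {
    if optimistically_confirmed >= new_root {
        // The root has not moved past the confirmed slot yet, so nothing can be concluded.
        return false;
    }
    // Walk the rooted chain backwards from `new_root`; if the optimistically confirmed slot
    // never shows up, the rooted fork does not descend from it.
    !AncestorIterator::new(new_root, blockstore).any(|slot| slot == optimistically_confirmed)
}

The idea is simply that a confirmed slot must stay on the rooted fork; failing to find it among a later root's ancestors means the cluster switched away from a fork it had optimistically confirmed.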
+ info!("Checking A's tower for a vote on slot descended from slot `next_slot_on_a`"); + let last_vote_slot = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) + .unwrap() + .0; + assert!(last_vote_slot >= next_slot_on_a); - info!("Waiting for A to vote on slot descended from slot `next_slot_on_a`"); - let now = Instant::now(); - loop { - if let Some((last_vote_slot, _)) = - last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) - { - if last_vote_slot >= next_slot_on_a { - info!( - "Validator A has caught up and voted on slot: {}", - last_vote_slot - ); - break; - } - } - - if now.elapsed().as_secs() >= 30 { - panic!( - "Validator A has not seen optimistic confirmation slot > {} in 30 seconds", - next_slot_on_a - ); - } - - sleep(Duration::from_millis(20)); - } - - info!("Killing A"); - let validator_a_info = cluster.exit_node(&validator_a_pubkey); { let blockstore = open_blockstore(&val_a_ledger_path); purge_slots(&blockstore, next_slot_on_a + 1, truncated_slots); @@ -270,9 +280,9 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b remove_tower(&val_a_ledger_path, &validator_a_pubkey); // Remove next_slot_on_a from ledger to force validator A to select - // votes_on_c_fork. Otherwise the validator A will immediately vote - // for 27 on restart, because it hasn't gotten the heavier fork from - // validator C yet. + // votes_on_c_fork. Otherwise, in the test case without a tower, + // the validator A will immediately vote for 27 on restart, because it + // hasn't gotten the heavier fork from validator C yet. // Then it will be stuck on 27 unable to switch because C doesn't // have enough stake to generate a switching proof purge_slots(&blockstore, next_slot_on_a, truncated_slots); @@ -281,7 +291,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b } } - // Step 4: + // Step 3: // Run validator C only to make it produce and vote on its own fork. 
info!("Restart validator C again!!!"); validator_c_info.config.voting_disabled = false; @@ -308,7 +318,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b assert!(!votes_on_c_fork.is_empty()); info!("collected validator C's votes: {:?}", votes_on_c_fork); - // Step 5: + // Step 4: // verify whether there was violation or not info!("Restart validator A again!!!"); cluster.restart_node( diff --git a/local-cluster/tests/local_cluster_slow.rs b/local-cluster/tests/local_cluster_slow.rs index 2f2ae10187..e586371094 100644 --- a/local-cluster/tests/local_cluster_slow.rs +++ b/local-cluster/tests/local_cluster_slow.rs @@ -3,9 +3,10 @@ #![allow(clippy::integer_arithmetic)] use { common::{ - copy_blocks, create_custom_leader_schedule, last_vote_in_tower, ms_for_n_slots, - open_blockstore, restore_tower, run_cluster_partition, run_kill_partition_switch_threshold, - test_faulty_node, wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER, + copy_blocks, create_custom_leader_schedule_with_random_keys, last_vote_in_tower, + ms_for_n_slots, open_blockstore, restore_tower, run_cluster_partition, + run_kill_partition_switch_threshold, test_faulty_node, + wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER, }, log::*, serial_test::serial, @@ -279,7 +280,7 @@ fn test_kill_heaviest_partition() { // 4) Check for recovery let num_slots_per_validator = 8; let partitions: [Vec; 4] = [vec![11], vec![10], vec![10], vec![10]]; - let (leader_schedule, validator_keys) = create_custom_leader_schedule(&[ + let (leader_schedule, validator_keys) = create_custom_leader_schedule_with_random_keys(&[ num_slots_per_validator * (partitions.len() - 1), num_slots_per_validator, num_slots_per_validator,