From bca1d517354fb07e136dd42de2f2110a775b29f7 Mon Sep 17 00:00:00 2001
From: carllin
Date: Wed, 16 Feb 2022 16:13:58 -0500
Subject: [PATCH] Fix flaky optimistic confirmation tests (#23178)

---
 local-cluster/tests/common.rs               |  36 +++--
 local-cluster/tests/local_cluster.rs        |  12 +-
 local-cluster/tests/local_cluster_flakey.rs | 154 +++++++++++---------
 local-cluster/tests/local_cluster_slow.rs   |   9 +-
 4 files changed, 117 insertions(+), 94 deletions(-)

diff --git a/local-cluster/tests/common.rs b/local-cluster/tests/common.rs
index 48389861ae..b16447ef82 100644
--- a/local-cluster/tests/common.rs
+++ b/local-cluster/tests/common.rs
@@ -166,7 +166,8 @@ pub fn run_kill_partition_switch_threshold(
         .flat_map(|stakes_and_slots| stakes_and_slots.iter().map(|(_, num_slots)| *num_slots))
         .collect();

-    let (leader_schedule, validator_keys) = create_custom_leader_schedule(&num_slots_per_validator);
+    let (leader_schedule, validator_keys) =
+        create_custom_leader_schedule_with_random_keys(&num_slots_per_validator);

     info!(
         "Validator ids: {:?}",
@@ -206,23 +207,32 @@ pub fn run_kill_partition_switch_threshold(
 }

 pub fn create_custom_leader_schedule(
-    validator_num_slots: &[usize],
-) -> (LeaderSchedule, Vec<Arc<Keypair>>) {
+    validator_key_to_slots: impl Iterator<Item = (Pubkey, usize)>,
+) -> LeaderSchedule {
     let mut leader_schedule = vec![];
-    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
-        .take(validator_num_slots.len())
-        .collect();
-    for (k, num_slots) in validator_keys.iter().zip(validator_num_slots.iter()) {
-        for _ in 0..*num_slots {
-            leader_schedule.push(k.pubkey())
+    for (k, num_slots) in validator_key_to_slots {
+        for _ in 0..num_slots {
+            leader_schedule.push(k)
         }
     }

     info!("leader_schedule: {}", leader_schedule.len());
-    (
-        LeaderSchedule::new_from_schedule(leader_schedule),
-        validator_keys,
-    )
+    LeaderSchedule::new_from_schedule(leader_schedule)
+}
+
+pub fn create_custom_leader_schedule_with_random_keys(
+    validator_num_slots: &[usize],
+) -> (LeaderSchedule, Vec<Arc<Keypair>>) {
+    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
+        .take(validator_num_slots.len())
+        .collect();
+    let leader_schedule = create_custom_leader_schedule(
+        validator_keys
+            .iter()
+            .map(|k| k.pubkey())
+            .zip(validator_num_slots.iter().cloned()),
+    );
+    (leader_schedule, validator_keys)
 }

 /// This function runs a network, initiates a partition based on a
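For orientation, here is a minimal usage sketch (not part of the patch) of the two helpers that common.rs now exposes: `create_custom_leader_schedule` takes explicit `(Pubkey, usize)` pairs so a test can pin known validator identities to particular leader slots, while `create_custom_leader_schedule_with_random_keys` keeps the old behavior of generating random keypairs for callers that only care about slot counts. The keypair names below are illustrative, and the snippet assumes the common.rs helpers are in scope:

// Illustrative sketch only; assumes the helpers above are imported, e.g. via `use common::*`.
use {
    solana_sdk::signature::{Keypair, Signer},
    std::sync::Arc,
};

fn custom_leader_schedule_example() {
    // Pin two known validators to fixed runs of leader slots.
    let validator_b = Arc::new(Keypair::new());
    let validator_c = Arc::new(Keypair::new());
    let validator_to_slots = vec![
        (validator_b.pubkey(), 10), // validator B leads slots 0..=9
        (validator_c.pubkey(), 10), // validator C leads slots 10..=19
    ];
    let leader_schedule = create_custom_leader_schedule(validator_to_slots.into_iter());
    assert_eq!(leader_schedule[0], validator_b.pubkey());

    // Callers that do not care which keys lead keep the old behavior through the wrapper,
    // which generates fresh random keypairs and returns them alongside the schedule.
    let (_schedule, _random_keys) = create_custom_leader_schedule_with_random_keys(&[10, 10]);
}

This is the shape the flakey test below relies on when it pins validators B and C as the only leaders.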
diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs
index 7a8c4b12c4..57b3ab409e 100644
--- a/local-cluster/tests/local_cluster.rs
+++ b/local-cluster/tests/local_cluster.rs
@@ -2,9 +2,9 @@
 use {
     assert_matches::assert_matches,
     common::{
-        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, ms_for_n_slots,
-        open_blockstore, purge_slots, remove_tower, restore_tower, run_cluster_partition,
-        run_kill_partition_switch_threshold, test_faulty_node,
+        copy_blocks, create_custom_leader_schedule_with_random_keys, last_vote_in_tower,
+        ms_for_n_slots, open_blockstore, purge_slots, remove_tower, restore_tower,
+        run_cluster_partition, run_kill_partition_switch_threshold, test_faulty_node,
         wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     crossbeam_channel::{unbounded, Receiver},
@@ -2489,8 +2489,10 @@ fn test_run_test_load_program_accounts_partition_root() {
 fn run_test_load_program_accounts_partition(scan_commitment: CommitmentConfig) {
     let num_slots_per_validator = 8;
     let partitions: [Vec<usize>; 2] = [vec![1], vec![1]];
-    let (leader_schedule, validator_keys) =
-        create_custom_leader_schedule(&[num_slots_per_validator, num_slots_per_validator]);
+    let (leader_schedule, validator_keys) = create_custom_leader_schedule_with_random_keys(&[
+        num_slots_per_validator,
+        num_slots_per_validator,
+    ]);

     let (update_client_sender, update_client_receiver) = unbounded();
     let (scan_client_sender, scan_client_receiver) = unbounded();
diff --git a/local-cluster/tests/local_cluster_flakey.rs b/local-cluster/tests/local_cluster_flakey.rs
index ae21ff9899..f38cc943ad 100644
--- a/local-cluster/tests/local_cluster_flakey.rs
+++ b/local-cluster/tests/local_cluster_flakey.rs
@@ -3,8 +3,8 @@
 #![allow(clippy::integer_arithmetic)]
 use {
     common::{
-        copy_blocks, last_vote_in_tower, open_blockstore, purge_slots, remove_tower,
-        wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
+        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, open_blockstore,
+        purge_slots, remove_tower, wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     log::*,
     serial_test::serial,
@@ -13,13 +13,17 @@ use {
         ancestor_iterator::AncestorIterator,
         blockstore::Blockstore,
         blockstore_db::{AccessType, BlockstoreOptions},
+        leader_schedule::FixedSchedule,
     },
     solana_local_cluster::{
         cluster::Cluster,
         local_cluster::{ClusterConfig, LocalCluster},
         validator_configs::*,
     },
-    solana_sdk::signature::{Keypair, Signer},
+    solana_sdk::{
+        clock::Slot,
+        signature::{Keypair, Signer},
+    },
     solana_streamer::socket::SocketAddrSpace,
     std::{
         sync::Arc,
@@ -62,27 +66,22 @@ fn test_optimistic_confirmation_violation_without_tower() {
 //
 // S0 -> S1 -> S2 -> S4
 //
-// Step 3: Then restart `A` which had 31% of the stake. With the tower, from `A`'s
-// perspective it sees:
+// Step 3: Then restart `A` which had 31% of the stake, and remove S3 from its ledger, so
+// that it only sees `C`'s fork at S2. From `A`'s perspective it sees:
 //
-// S0 -> S1 -> S2 -> S3 (voted)
+// S0 -> S1 -> S2
 //             |
 //             -> S4 -> S5 (C's vote for S4)
 //
 // The fork choice rule weights look like:
 //
-// S0 -> S1 -> S2 (ABC) -> S3
+// S0 -> S1 -> S2 (ABC)
 //             |
 //             -> S4 (C) -> S5
 //
 // Step 5:
 // Without the persisted tower:
-//    `A` would choose to vote on the fork with `S4 -> S5`. This is true even if `A`
-//    generates a new fork starting at slot `S3` because `C` has more stake than `A`
-//    so `A` will eventually pick the fork `C` is on.
-//
-//    Furthermore `B`'s vote on `S3` is not observable because there are no
-//    descendants of slot `S3`, so that fork will not be chosen over `C`'s fork
+//    `A` would choose to vote on the fork with `S4 -> S5`.
 //
 // With the persisted tower:
 //    `A` should not be able to generate a switching proof.
@@ -94,6 +93,10 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     let slots_per_epoch = 2048;
     let node_stakes = vec![31, 36, 33, 0];

+    let base_slot: Slot = 26; // S2
+    let next_slot_on_a: Slot = 27; // S3
+    let truncated_slots: Slot = 100; // just enough to purge all following slots after the S2 and S3
+
     // Each pubkeys are prefixed with A, B, C and D.
     // D is needed to:
     // 1) Propagate A's votes for S2 to validator C after A shuts down so that
@@ -128,10 +131,29 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     // because if validator A votes past `next_slot_on_a`, and then we copy over validator B's ledger
     // below only for slots <= `next_slot_on_a`, validator A will not know how it's last vote chains
     // to the otehr forks, and may violate switching proofs on restart.
-    let mut validator_configs =
-        make_identical_validator_configs(&ValidatorConfig::default_for_test(), node_stakes.len());
+    let mut default_config = ValidatorConfig::default_for_test();
+    // Split leader schedule 50-50 between validators B and C, don't give validator A any slots because
+    // it's going to be deleting its ledger, so may create versions of slots it's already created, but
+    // on a different fork.
+    let validator_to_slots = vec![
+        // Ensure validator b is leader for slots <= `next_slot_on_a`
+        (validator_b_pubkey, next_slot_on_a as usize + 1),
+        (validator_c_pubkey, next_slot_on_a as usize + 1),
+    ];

-    validator_configs[0].voting_disabled = true;
+    let leader_schedule = create_custom_leader_schedule(validator_to_slots.into_iter());
+    for slot in 0..=next_slot_on_a {
+        assert_eq!(leader_schedule[slot], validator_b_pubkey);
+    }
+
+    default_config.fixed_leader_schedule = Some(FixedSchedule {
+        start_epoch: 0,
+        leader_schedule: Arc::new(leader_schedule),
+    });
+    let mut validator_configs =
+        make_identical_validator_configs(&default_config, node_stakes.len());
+
+    // Disable voting on validator C
     validator_configs[2].voting_disabled = true;

     let mut config = ClusterConfig {
@@ -146,10 +168,6 @@
     };
     let mut cluster = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);

-    let base_slot = 26; // S2
-    let next_slot_on_a = 27; // S3
-    let truncated_slots = 100; // just enough to purge all following slots after the S2 and S3
-
     let val_a_ledger_path = cluster.ledger_path(&validator_a_pubkey);
     let val_b_ledger_path = cluster.ledger_path(&validator_b_pubkey);
     let val_c_ledger_path = cluster.ledger_path(&validator_c_pubkey);
@@ -167,43 +185,65 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         validator_c_pubkey, val_c_ledger_path
     );

-    // Immediately kill validator A, and C
-    info!("Exiting validators A and C");
-    let mut validator_a_info = cluster.exit_node(&validator_a_pubkey);
+    // Immediately kill validator C. No need to kill validator A because
+    // 1) It has no slots in the leader schedule, so no way to make forks
+    // 2) We need it to vote
+    info!("Exiting validator C");
     let mut validator_c_info = cluster.exit_node(&validator_c_pubkey);

     // Step 1:
-    // Let validator B, (D) run for a while.
+    // Let validator A, B, (D) run. Wait for both `A` and `B` to have voted on `next_slot_on_a` or
+    // one of its descendants
+    info!(
+        "Waiting on both validators A and B to vote on fork at slot {}",
+        next_slot_on_a
+    );
     let now = Instant::now();
+    let mut last_b_vote = 0;
+    let mut last_a_vote = 0;
     loop {
         let elapsed = now.elapsed();
         assert!(
             elapsed <= Duration::from_secs(30),
-            "Validator B failed to vote on any slot >= {} in {} secs",
+            "One of the validators failed to vote on a slot >= {} in {} secs,
+            last validator A vote: {},
+            last validator B vote: {}",
             next_slot_on_a,
-            elapsed.as_secs()
+            elapsed.as_secs(),
+            last_a_vote,
+            last_b_vote,
         );
         sleep(Duration::from_millis(100));

         if let Some((last_vote, _)) = last_vote_in_tower(&val_b_ledger_path, &validator_b_pubkey) {
+            last_b_vote = last_vote;
+            if last_vote < next_slot_on_a {
+                continue;
+            }
+        }
+
+        if let Some((last_vote, _)) = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
+            last_a_vote = last_vote;
             if last_vote >= next_slot_on_a {
                 break;
             }
         }
     }
-    // kill B
+
+    // kill A and B
     let _validator_b_info = cluster.exit_node(&validator_b_pubkey);
+    let validator_a_info = cluster.exit_node(&validator_a_pubkey);

     // Step 2:
-    // Stop validator and truncate ledger, copy over B's ledger to A
-    info!("truncate validator C's ledger");
+    // Stop validator and truncate ledger, copy over B's ledger to C
+    info!("Create validator C's ledger");
     {
         // first copy from validator B's ledger
         std::fs::remove_dir_all(&validator_c_info.info.ledger_path).unwrap();
         let mut opt = fs_extra::dir::CopyOptions::new();
         opt.copy_inside = true;
         fs_extra::dir::copy(&val_b_ledger_path, &val_c_ledger_path, &opt).unwrap();
-        // Remove B's tower in the C's new copied ledger
+        // Remove B's tower in C's new copied ledger
         remove_tower(&val_c_ledger_path, &validator_b_pubkey);

         let blockstore = open_blockstore(&val_c_ledger_path);
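The waiting logic added in the hunk above is what later lets the test assert directly on validator A's tower instead of restarting A and polling again. A condensed sketch of that wait pattern as a stand-alone helper (hypothetical name; it builds only on the existing `last_vote_in_tower` helper from common.rs) might look like the following; the patch itself keeps the logic inline:

// Hypothetical helper, not part of the patch: poll a validator's on-disk tower until it
// records a vote at or past `target_slot`, or panic once `timeout` elapses.
// Assumes `last_vote_in_tower` from local-cluster/tests/common.rs is in scope.
use {
    solana_sdk::{clock::Slot, pubkey::Pubkey},
    std::{
        path::Path,
        thread::sleep,
        time::{Duration, Instant},
    },
};

fn wait_for_vote_at_or_after(
    ledger_path: &Path,
    node_pubkey: &Pubkey,
    target_slot: Slot,
    timeout: Duration,
) -> Slot {
    let start = Instant::now();
    loop {
        // Read the persisted tower; `None` means the validator has not voted yet.
        if let Some((last_vote, _)) = last_vote_in_tower(ledger_path, node_pubkey) {
            if last_vote >= target_slot {
                return last_vote;
            }
        }
        assert!(
            start.elapsed() <= timeout,
            "{} did not vote on a slot >= {} within {:?}",
            node_pubkey,
            target_slot,
            timeout
        );
        sleep(Duration::from_millis(100));
    }
}

The test tracks A's and B's last votes separately so that, on timeout, the assertion message shows how far each validator got.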
@@ -224,44 +264,14 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         purge_slots(&a_blockstore, next_slot_on_a + 1, truncated_slots);
     }

-    // Step 3:
-    // Restart A with voting enabled so that it can vote on B's fork
-    // up to `next_slot_on_a`, thereby optimistcally confirming `next_slot_on_a`
-    info!("Restarting A");
-    validator_a_info.config.voting_disabled = false;
-    cluster.restart_node(
-        &validator_a_pubkey,
-        validator_a_info,
-        SocketAddrSpace::Unspecified,
-    );
+    // This should be guaranteed because we waited for validator `A` to vote on a slot > `next_slot_on_a`
+    // before killing it earlier.
+    info!("Checking A's tower for a vote on slot descended from slot `next_slot_on_a`");
+    let last_vote_slot = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
+        .unwrap()
+        .0;
+    assert!(last_vote_slot >= next_slot_on_a);

-    info!("Waiting for A to vote on slot descended from slot `next_slot_on_a`");
-    let now = Instant::now();
-    loop {
-        if let Some((last_vote_slot, _)) =
-            last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
-        {
-            if last_vote_slot >= next_slot_on_a {
-                info!(
-                    "Validator A has caught up and voted on slot: {}",
-                    last_vote_slot
-                );
-                break;
-            }
-        }
-
-        if now.elapsed().as_secs() >= 30 {
-            panic!(
-                "Validator A has not seen optimistic confirmation slot > {} in 30 seconds",
-                next_slot_on_a
-            );
-        }
-
-        sleep(Duration::from_millis(20));
-    }
-
-    info!("Killing A");
-    let validator_a_info = cluster.exit_node(&validator_a_pubkey);
     {
         let blockstore = open_blockstore(&val_a_ledger_path);
         purge_slots(&blockstore, next_slot_on_a + 1, truncated_slots);
@@ -270,9 +280,9 @@
             remove_tower(&val_a_ledger_path, &validator_a_pubkey);

             // Remove next_slot_on_a from ledger to force validator A to select
-            // votes_on_c_fork. Otherwise the validator A will immediately vote
-            // for 27 on restart, because it hasn't gotten the heavier fork from
-            // validator C yet.
+            // votes_on_c_fork. Otherwise, in the test case without a tower,
+            // the validator A will immediately vote for 27 on restart, because it
+            // hasn't gotten the heavier fork from validator C yet.
             // Then it will be stuck on 27 unable to switch because C doesn't
             // have enough stake to generate a switching proof
             purge_slots(&blockstore, next_slot_on_a, truncated_slots);
@@ -281,7 +291,7 @@
         }
     }

-    // Step 4:
+    // Step 3:
     // Run validator C only to make it produce and vote on its own fork.
     info!("Restart validator C again!!!");
     validator_c_info.config.voting_disabled = false;
     cluster.restart_node(
         &validator_c_pubkey,
         validator_c_info,
         SocketAddrSpace::Unspecified,
     );
@@ -308,7 +318,7 @@
     assert!(!votes_on_c_fork.is_empty());
     info!("collected validator C's votes: {:?}", votes_on_c_fork);

-    // Step 5:
+    // Step 4:
     // verify whether there was violation or not
     info!("Restart validator A again!!!");
     cluster.restart_node(
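The flakey test's final step (renumbered to Step 4 above) restarts `A` and then checks whether optimistic confirmation was violated; that check itself lies outside the hunks shown here. As a rough, illustrative sketch of what such a check can look like against a blockstore (hypothetical helper, not the test's literal code):

// Sketch only: a slot that was optimistically confirmed must remain an ancestor of
// (or be equal to) any later root; otherwise the cluster abandoned a confirmed fork.
use {
    solana_ledger::{ancestor_iterator::AncestorIterator, blockstore::Blockstore},
    solana_sdk::clock::Slot,
};

fn is_optimistic_confirmation_violation(
    blockstore: &Blockstore,
    new_root: Slot,
    optimistically_confirmed: Slot,
) -> bool {
    if optimistically_confirmed >= new_root {
        // The root has not moved past the confirmed slot yet, so nothing can be concluded.
        return false;
    }
    // Walk the rooted chain backwards from `new_root`; if the optimistically confirmed slot
    // never shows up, the rooted fork does not descend from it.
    !AncestorIterator::new(new_root, blockstore).any(|slot| slot == optimistically_confirmed)
}

The idea is simply that a confirmed slot must stay on the rooted fork; failing to find it among a later root's ancestors means the cluster switched away from a fork it had optimistically confirmed.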
+ info!("Checking A's tower for a vote on slot descended from slot `next_slot_on_a`"); + let last_vote_slot = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) + .unwrap() + .0; + assert!(last_vote_slot >= next_slot_on_a); - info!("Waiting for A to vote on slot descended from slot `next_slot_on_a`"); - let now = Instant::now(); - loop { - if let Some((last_vote_slot, _)) = - last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) - { - if last_vote_slot >= next_slot_on_a { - info!( - "Validator A has caught up and voted on slot: {}", - last_vote_slot - ); - break; - } - } - - if now.elapsed().as_secs() >= 30 { - panic!( - "Validator A has not seen optimistic confirmation slot > {} in 30 seconds", - next_slot_on_a - ); - } - - sleep(Duration::from_millis(20)); - } - - info!("Killing A"); - let validator_a_info = cluster.exit_node(&validator_a_pubkey); { let blockstore = open_blockstore(&val_a_ledger_path); purge_slots(&blockstore, next_slot_on_a + 1, truncated_slots); @@ -270,9 +280,9 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b remove_tower(&val_a_ledger_path, &validator_a_pubkey); // Remove next_slot_on_a from ledger to force validator A to select - // votes_on_c_fork. Otherwise the validator A will immediately vote - // for 27 on restart, because it hasn't gotten the heavier fork from - // validator C yet. + // votes_on_c_fork. Otherwise, in the test case without a tower, + // the validator A will immediately vote for 27 on restart, because it + // hasn't gotten the heavier fork from validator C yet. // Then it will be stuck on 27 unable to switch because C doesn't // have enough stake to generate a switching proof purge_slots(&blockstore, next_slot_on_a, truncated_slots); @@ -281,7 +291,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b } } - // Step 4: + // Step 3: // Run validator C only to make it produce and vote on its own fork. 
info!("Restart validator C again!!!"); validator_c_info.config.voting_disabled = false; @@ -308,7 +318,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b assert!(!votes_on_c_fork.is_empty()); info!("collected validator C's votes: {:?}", votes_on_c_fork); - // Step 5: + // Step 4: // verify whether there was violation or not info!("Restart validator A again!!!"); cluster.restart_node( diff --git a/local-cluster/tests/local_cluster_slow.rs b/local-cluster/tests/local_cluster_slow.rs index 2f2ae10187..e586371094 100644 --- a/local-cluster/tests/local_cluster_slow.rs +++ b/local-cluster/tests/local_cluster_slow.rs @@ -3,9 +3,10 @@ #![allow(clippy::integer_arithmetic)] use { common::{ - copy_blocks, create_custom_leader_schedule, last_vote_in_tower, ms_for_n_slots, - open_blockstore, restore_tower, run_cluster_partition, run_kill_partition_switch_threshold, - test_faulty_node, wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER, + copy_blocks, create_custom_leader_schedule_with_random_keys, last_vote_in_tower, + ms_for_n_slots, open_blockstore, restore_tower, run_cluster_partition, + run_kill_partition_switch_threshold, test_faulty_node, + wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER, }, log::*, serial_test::serial, @@ -279,7 +280,7 @@ fn test_kill_heaviest_partition() { // 4) Check for recovery let num_slots_per_validator = 8; let partitions: [Vec; 4] = [vec![11], vec![10], vec![10], vec![10]]; - let (leader_schedule, validator_keys) = create_custom_leader_schedule(&[ + let (leader_schedule, validator_keys) = create_custom_leader_schedule_with_random_keys(&[ num_slots_per_validator * (partitions.len() - 1), num_slots_per_validator, num_slots_per_validator,