361 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
		
		
			
		
	
	
			361 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
|   | //! Move flakey tests here so that when they fail, there's less to retry in CI
 | ||
|  | //! because these tests are run separately from the rest of local cluster tests.
 | ||
|  | #![allow(clippy::integer_arithmetic)]
 | ||
|  | use {
 | ||
|  |     common::{
 | ||
|  |         copy_blocks, last_vote_in_tower, open_blockstore, purge_slots, remove_tower,
 | ||
|  |         wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
 | ||
|  |     },
 | ||
|  |     log::*,
 | ||
|  |     serial_test::serial,
 | ||
|  |     solana_core::validator::ValidatorConfig,
 | ||
|  |     solana_ledger::{
 | ||
|  |         ancestor_iterator::AncestorIterator,
 | ||
|  |         blockstore::Blockstore,
 | ||
|  |         blockstore_db::{AccessType, BlockstoreOptions},
 | ||
|  |     },
 | ||
|  |     solana_local_cluster::{
 | ||
|  |         cluster::Cluster,
 | ||
|  |         local_cluster::{ClusterConfig, LocalCluster},
 | ||
|  |         validator_configs::*,
 | ||
|  |     },
 | ||
|  |     solana_sdk::signature::{Keypair, Signer},
 | ||
|  |     solana_streamer::socket::SocketAddrSpace,
 | ||
|  |     std::{
 | ||
|  |         sync::Arc,
 | ||
|  |         thread::sleep,
 | ||
|  |         time::{Duration, Instant},
 | ||
|  |     },
 | ||
|  | };
 | ||
|  | 
 | ||
|  | mod common;
 | ||
|  | 
 | ||
|  | #[test]
 | ||
|  | #[serial]
 | ||
|  | fn test_no_optimistic_confirmation_violation_with_tower() {
 | ||
|  |     do_test_optimistic_confirmation_violation_with_or_without_tower(true);
 | ||
|  | }
 | ||
|  | 
 | ||
|  | #[test]
 | ||
|  | #[serial]
 | ||
|  | fn test_optimistic_confirmation_violation_without_tower() {
 | ||
|  |     do_test_optimistic_confirmation_violation_with_or_without_tower(false);
 | ||
|  | }
 | ||
|  | 
 | ||
|  | // A bit convoluted test case; but this roughly follows this test theoretical scenario:
 | ||
|  | //
 | ||
|  | // Step 1: You have validator A + B with 31% and 36% of the stake. Run only validator B:
 | ||
|  | //
 | ||
|  | //  S0 -> S1 -> S2 -> S3 (B vote)
 | ||
|  | //
 | ||
|  | // Step 2: Turn off B, and truncate the ledger after slot `S3` (simulate votes not
 | ||
|  | // landing in next slot).
 | ||
|  | // Copy ledger fully to validator A and validator C
 | ||
|  | //
 | ||
|  | // Step 3: Turn on A, and have it vote up to S3. Truncate anything past slot `S3`.
 | ||
|  | //
 | ||
|  | //  S0 -> S1 -> S2 -> S3 (A & B vote, optimistically confirmed)
 | ||
|  | //
 | ||
|  | // Step 4:
 | ||
|  | // Start validator C with 33% of the stake with same ledger, but only up to slot S2.
 | ||
|  | // Have `C` generate some blocks like:
 | ||
|  | //
 | ||
|  | // S0 -> S1 -> S2 -> S4
 | ||
|  | //
 | ||
|  | // Step 3: Then restart `A` which had 31% of the stake. With the tower, from `A`'s
 | ||
|  | // perspective it sees:
 | ||
|  | //
 | ||
|  | // S0 -> S1 -> S2 -> S3 (voted)
 | ||
|  | //             |
 | ||
|  | //             -> S4 -> S5 (C's vote for S4)
 | ||
|  | //
 | ||
|  | // The fork choice rule weights look like:
 | ||
|  | //
 | ||
|  | // S0 -> S1 -> S2 (ABC) -> S3
 | ||
|  | //             |
 | ||
|  | //             -> S4 (C) -> S5
 | ||
|  | //
 | ||
|  | // Step 5:
 | ||
|  | // Without the persisted tower:
 | ||
|  | //    `A` would choose to vote on the fork with `S4 -> S5`. This is true even if `A`
 | ||
|  | //    generates a new fork starting at slot `S3` because `C` has more stake than `A`
 | ||
|  | //    so `A` will eventually pick the fork `C` is on.
 | ||
|  | //
 | ||
|  | //    Furthermore `B`'s vote on `S3` is not observable because there are no
 | ||
|  | //    descendants of slot `S3`, so that fork will not be chosen over `C`'s fork
 | ||
|  | //
 | ||
|  | // With the persisted tower:
 | ||
|  | //    `A` should not be able to generate a switching proof.
 | ||
|  | //
 | ||
|  | fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: bool) {
 | ||
|  |     solana_logger::setup_with_default(RUST_LOG_FILTER);
 | ||
|  | 
 | ||
|  |     // First set up the cluster with 4 nodes
 | ||
|  |     let slots_per_epoch = 2048;
 | ||
|  |     let node_stakes = vec![31, 36, 33, 0];
 | ||
|  | 
 | ||
|  |     // Each pubkeys are prefixed with A, B, C and D.
 | ||
|  |     // D is needed to:
 | ||
|  |     // 1) Propagate A's votes for S2 to validator C after A shuts down so that
 | ||
|  |     // C can avoid NoPropagatedConfirmation errors and continue to generate blocks
 | ||
|  |     // 2) Provide gossip discovery for `A` when it restarts because `A` will restart
 | ||
|  |     // at a different gossip port than the entrypoint saved in C's gossip table
 | ||
|  |     let validator_keys = vec![
 | ||
|  |         "28bN3xyvrP4E8LwEgtLjhnkb7cY4amQb6DrYAbAYjgRV4GAGgkVM2K7wnxnAS7WDneuavza7x21MiafLu1HkwQt4",
 | ||
|  |         "2saHBBoTkLMmttmPQP8KfBkcCw45S5cwtV3wTdGCscRC8uxdgvHxpHiWXKx4LvJjNJtnNcbSv5NdheokFFqnNDt8",
 | ||
|  |         "4mx9yoFBeYasDKBGDWCTWGJdWuJCKbgqmuP8bN9umybCh5Jzngw7KQxe99Rf5uzfyzgba1i65rJW4Wqk7Ab5S8ye",
 | ||
|  |         "3zsEPEDsjfEay7te9XqNjRTCE7vwuT6u4DHzBJC19yp7GS8BuNRMRjnpVrKCBzb3d44kxc4KPGSHkCmk6tEfswCg",
 | ||
|  |     ]
 | ||
|  |     .iter()
 | ||
|  |     .map(|s| (Arc::new(Keypair::from_base58_string(s)), true))
 | ||
|  |     .take(node_stakes.len())
 | ||
|  |     .collect::<Vec<_>>();
 | ||
|  |     let validators = validator_keys
 | ||
|  |         .iter()
 | ||
|  |         .map(|(kp, _)| kp.pubkey())
 | ||
|  |         .collect::<Vec<_>>();
 | ||
|  |     let (validator_a_pubkey, validator_b_pubkey, validator_c_pubkey) =
 | ||
|  |         (validators[0], validators[1], validators[2]);
 | ||
|  | 
 | ||
|  |     // Disable voting on all validators other than validator B to ensure neither of the below two
 | ||
|  |     // scenarios occur:
 | ||
|  |     // 1. If the cluster immediately forks on restart while we're killing validators A and C,
 | ||
|  |     // with Validator B on one side, and `A` and `C` on a heavier fork, it's possible that the lockouts
 | ||
|  |     // on `A` and `C`'s latest votes do not extend past validator B's latest vote. Then validator B
 | ||
|  |     // will be stuck unable to vote, but also unable generate a switching proof to the heavier fork.
 | ||
|  |     //
 | ||
|  |     // 2. Validator A doesn't vote past `next_slot_on_a` before we can kill it. This is essential
 | ||
|  |     // because if validator A votes past `next_slot_on_a`, and then we copy over validator B's ledger
 | ||
|  |     // below only for slots <= `next_slot_on_a`, validator A will not know how it's last vote chains
 | ||
|  |     // to the otehr forks, and may violate switching proofs on restart.
 | ||
|  |     let mut validator_configs =
 | ||
|  |         make_identical_validator_configs(&ValidatorConfig::default(), node_stakes.len());
 | ||
|  | 
 | ||
|  |     validator_configs[0].voting_disabled = true;
 | ||
|  |     validator_configs[2].voting_disabled = true;
 | ||
|  | 
 | ||
|  |     let mut config = ClusterConfig {
 | ||
|  |         cluster_lamports: 100_000,
 | ||
|  |         node_stakes,
 | ||
|  |         validator_configs,
 | ||
|  |         validator_keys: Some(validator_keys),
 | ||
|  |         slots_per_epoch,
 | ||
|  |         stakers_slot_offset: slots_per_epoch,
 | ||
|  |         skip_warmup_slots: true,
 | ||
|  |         ..ClusterConfig::default()
 | ||
|  |     };
 | ||
|  |     let mut cluster = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);
 | ||
|  | 
 | ||
|  |     let base_slot = 26; // S2
 | ||
|  |     let next_slot_on_a = 27; // S3
 | ||
|  |     let truncated_slots = 100; // just enough to purge all following slots after the S2 and S3
 | ||
|  | 
 | ||
|  |     let val_a_ledger_path = cluster.ledger_path(&validator_a_pubkey);
 | ||
|  |     let val_b_ledger_path = cluster.ledger_path(&validator_b_pubkey);
 | ||
|  |     let val_c_ledger_path = cluster.ledger_path(&validator_c_pubkey);
 | ||
|  | 
 | ||
|  |     info!(
 | ||
|  |         "val_a {} ledger path {:?}",
 | ||
|  |         validator_a_pubkey, val_a_ledger_path
 | ||
|  |     );
 | ||
|  |     info!(
 | ||
|  |         "val_b {} ledger path {:?}",
 | ||
|  |         validator_b_pubkey, val_b_ledger_path
 | ||
|  |     );
 | ||
|  |     info!(
 | ||
|  |         "val_c {} ledger path {:?}",
 | ||
|  |         validator_c_pubkey, val_c_ledger_path
 | ||
|  |     );
 | ||
|  | 
 | ||
|  |     // Immediately kill validator A, and C
 | ||
|  |     info!("Exiting validators A and C");
 | ||
|  |     let mut validator_a_info = cluster.exit_node(&validator_a_pubkey);
 | ||
|  |     let mut validator_c_info = cluster.exit_node(&validator_c_pubkey);
 | ||
|  | 
 | ||
|  |     // Step 1:
 | ||
|  |     // Let validator B, (D) run for a while.
 | ||
|  |     let now = Instant::now();
 | ||
|  |     loop {
 | ||
|  |         let elapsed = now.elapsed();
 | ||
|  |         assert!(
 | ||
|  |             elapsed <= Duration::from_secs(30),
 | ||
|  |             "Validator B failed to vote on any slot >= {} in {} secs",
 | ||
|  |             next_slot_on_a,
 | ||
|  |             elapsed.as_secs()
 | ||
|  |         );
 | ||
|  |         sleep(Duration::from_millis(100));
 | ||
|  | 
 | ||
|  |         if let Some((last_vote, _)) = last_vote_in_tower(&val_b_ledger_path, &validator_b_pubkey) {
 | ||
|  |             if last_vote >= next_slot_on_a {
 | ||
|  |                 break;
 | ||
|  |             }
 | ||
|  |         }
 | ||
|  |     }
 | ||
|  |     // kill B
 | ||
|  |     let _validator_b_info = cluster.exit_node(&validator_b_pubkey);
 | ||
|  | 
 | ||
|  |     // Step 2:
 | ||
|  |     // Stop validator and truncate ledger, copy over B's ledger to A
 | ||
|  |     info!("truncate validator C's ledger");
 | ||
|  |     {
 | ||
|  |         // first copy from validator B's ledger
 | ||
|  |         std::fs::remove_dir_all(&validator_c_info.info.ledger_path).unwrap();
 | ||
|  |         let mut opt = fs_extra::dir::CopyOptions::new();
 | ||
|  |         opt.copy_inside = true;
 | ||
|  |         fs_extra::dir::copy(&val_b_ledger_path, &val_c_ledger_path, &opt).unwrap();
 | ||
|  |         // Remove B's tower in the C's new copied ledger
 | ||
|  |         remove_tower(&val_c_ledger_path, &validator_b_pubkey);
 | ||
|  | 
 | ||
|  |         let blockstore = open_blockstore(&val_c_ledger_path);
 | ||
|  |         purge_slots(&blockstore, base_slot + 1, truncated_slots);
 | ||
|  |     }
 | ||
|  |     info!("Create validator A's ledger");
 | ||
|  |     {
 | ||
|  |         // Find latest vote in B, and wait for it to reach blockstore
 | ||
|  |         let b_last_vote =
 | ||
|  |             wait_for_last_vote_in_tower_to_land_in_ledger(&val_b_ledger_path, &validator_b_pubkey);
 | ||
|  | 
 | ||
|  |         // Now we copy these blocks to A
 | ||
|  |         let b_blockstore = open_blockstore(&val_b_ledger_path);
 | ||
|  |         let a_blockstore = open_blockstore(&val_a_ledger_path);
 | ||
|  |         copy_blocks(b_last_vote, &b_blockstore, &a_blockstore);
 | ||
|  | 
 | ||
|  |         // Purge uneccessary slots
 | ||
|  |         purge_slots(&a_blockstore, next_slot_on_a + 1, truncated_slots);
 | ||
|  |     }
 | ||
|  | 
 | ||
|  |     // Step 3:
 | ||
|  |     // Restart A with voting enabled so that it can vote on B's fork
 | ||
|  |     // up to `next_slot_on_a`, thereby optimistcally confirming `next_slot_on_a`
 | ||
|  |     info!("Restarting A");
 | ||
|  |     validator_a_info.config.voting_disabled = false;
 | ||
|  |     cluster.restart_node(
 | ||
|  |         &validator_a_pubkey,
 | ||
|  |         validator_a_info,
 | ||
|  |         SocketAddrSpace::Unspecified,
 | ||
|  |     );
 | ||
|  | 
 | ||
|  |     info!("Waiting for A to vote on slot descended from slot `next_slot_on_a`");
 | ||
|  |     let now = Instant::now();
 | ||
|  |     loop {
 | ||
|  |         if let Some((last_vote_slot, _)) =
 | ||
|  |             last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
 | ||
|  |         {
 | ||
|  |             if last_vote_slot >= next_slot_on_a {
 | ||
|  |                 info!(
 | ||
|  |                     "Validator A has caught up and voted on slot: {}",
 | ||
|  |                     last_vote_slot
 | ||
|  |                 );
 | ||
|  |                 break;
 | ||
|  |             }
 | ||
|  |         }
 | ||
|  | 
 | ||
|  |         if now.elapsed().as_secs() >= 30 {
 | ||
|  |             panic!(
 | ||
|  |                 "Validator A has not seen optimistic confirmation slot > {} in 30 seconds",
 | ||
|  |                 next_slot_on_a
 | ||
|  |             );
 | ||
|  |         }
 | ||
|  | 
 | ||
|  |         sleep(Duration::from_millis(20));
 | ||
|  |     }
 | ||
|  | 
 | ||
|  |     info!("Killing A");
 | ||
|  |     let validator_a_info = cluster.exit_node(&validator_a_pubkey);
 | ||
|  |     {
 | ||
|  |         let blockstore = open_blockstore(&val_a_ledger_path);
 | ||
|  |         purge_slots(&blockstore, next_slot_on_a + 1, truncated_slots);
 | ||
|  |         if !with_tower {
 | ||
|  |             info!("Removing tower!");
 | ||
|  |             remove_tower(&val_a_ledger_path, &validator_a_pubkey);
 | ||
|  | 
 | ||
|  |             // Remove next_slot_on_a from ledger to force validator A to select
 | ||
|  |             // votes_on_c_fork. Otherwise the validator A will immediately vote
 | ||
|  |             // for 27 on restart, because it hasn't gotten the heavier fork from
 | ||
|  |             // validator C yet.
 | ||
|  |             // Then it will be stuck on 27 unable to switch because C doesn't
 | ||
|  |             // have enough stake to generate a switching proof
 | ||
|  |             purge_slots(&blockstore, next_slot_on_a, truncated_slots);
 | ||
|  |         } else {
 | ||
|  |             info!("Not removing tower!");
 | ||
|  |         }
 | ||
|  |     }
 | ||
|  | 
 | ||
|  |     // Step 4:
 | ||
|  |     // Run validator C only to make it produce and vote on its own fork.
 | ||
|  |     info!("Restart validator C again!!!");
 | ||
|  |     validator_c_info.config.voting_disabled = false;
 | ||
|  |     cluster.restart_node(
 | ||
|  |         &validator_c_pubkey,
 | ||
|  |         validator_c_info,
 | ||
|  |         SocketAddrSpace::Unspecified,
 | ||
|  |     );
 | ||
|  | 
 | ||
|  |     let mut votes_on_c_fork = std::collections::BTreeSet::new(); // S4 and S5
 | ||
|  |     for _ in 0..100 {
 | ||
|  |         sleep(Duration::from_millis(100));
 | ||
|  | 
 | ||
|  |         if let Some((last_vote, _)) = last_vote_in_tower(&val_c_ledger_path, &validator_c_pubkey) {
 | ||
|  |             if last_vote != base_slot {
 | ||
|  |                 votes_on_c_fork.insert(last_vote);
 | ||
|  |                 // Collect 4 votes
 | ||
|  |                 if votes_on_c_fork.len() >= 4 {
 | ||
|  |                     break;
 | ||
|  |                 }
 | ||
|  |             }
 | ||
|  |         }
 | ||
|  |     }
 | ||
|  |     assert!(!votes_on_c_fork.is_empty());
 | ||
|  |     info!("collected validator C's votes: {:?}", votes_on_c_fork);
 | ||
|  | 
 | ||
|  |     // Step 5:
 | ||
|  |     // verify whether there was violation or not
 | ||
|  |     info!("Restart validator A again!!!");
 | ||
|  |     cluster.restart_node(
 | ||
|  |         &validator_a_pubkey,
 | ||
|  |         validator_a_info,
 | ||
|  |         SocketAddrSpace::Unspecified,
 | ||
|  |     );
 | ||
|  | 
 | ||
|  |     // monitor for actual votes from validator A
 | ||
|  |     let mut bad_vote_detected = false;
 | ||
|  |     let mut a_votes = vec![];
 | ||
|  |     for _ in 0..100 {
 | ||
|  |         sleep(Duration::from_millis(100));
 | ||
|  | 
 | ||
|  |         if let Some((last_vote, _)) = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
 | ||
|  |             a_votes.push(last_vote);
 | ||
|  |             let blockstore = Blockstore::open_with_access_type(
 | ||
|  |                 &val_a_ledger_path,
 | ||
|  |                 BlockstoreOptions {
 | ||
|  |                     access_type: AccessType::TryPrimaryThenSecondary,
 | ||
|  |                     recovery_mode: None,
 | ||
|  |                     enforce_ulimit_nofile: true,
 | ||
|  |                 },
 | ||
|  |             )
 | ||
|  |             .unwrap();
 | ||
|  |             let mut ancestors = AncestorIterator::new(last_vote, &blockstore);
 | ||
|  |             if ancestors.any(|a| votes_on_c_fork.contains(&a)) {
 | ||
|  |                 bad_vote_detected = true;
 | ||
|  |                 break;
 | ||
|  |             }
 | ||
|  |         }
 | ||
|  |     }
 | ||
|  | 
 | ||
|  |     info!("Observed A's votes on: {:?}", a_votes);
 | ||
|  | 
 | ||
|  |     // an elaborate way of assert!(with_tower && !bad_vote_detected || ...)
 | ||
|  |     let expects_optimistic_confirmation_violation = !with_tower;
 | ||
|  |     if bad_vote_detected != expects_optimistic_confirmation_violation {
 | ||
|  |         if bad_vote_detected {
 | ||
|  |             panic!("No violation expected because of persisted tower!");
 | ||
|  |         } else {
 | ||
|  |             panic!("Violation expected because of removed persisted tower!");
 | ||
|  |         }
 | ||
|  |     } else if bad_vote_detected {
 | ||
|  |         info!("THIS TEST expected violations. And indeed, there was some, because of removed persisted tower.");
 | ||
|  |     } else {
 | ||
|  |         info!("THIS TEST expected no violation. And indeed, there was none, thanks to persisted tower.");
 | ||
|  |     }
 | ||
|  | }
 |