Fix flaky optimistic confirmation tests (#23178)

carllin
2022-02-16 16:13:58 -05:00
committed by GitHub
parent 6e28a03c73
commit bca1d51735
4 changed files with 117 additions and 94 deletions

View File

@@ -166,7 +166,8 @@ pub fn run_kill_partition_switch_threshold<C>(
         .flat_map(|stakes_and_slots| stakes_and_slots.iter().map(|(_, num_slots)| *num_slots))
         .collect();
-    let (leader_schedule, validator_keys) = create_custom_leader_schedule(&num_slots_per_validator);
+    let (leader_schedule, validator_keys) =
+        create_custom_leader_schedule_with_random_keys(&num_slots_per_validator);
 
     info!(
         "Validator ids: {:?}",
@@ -206,23 +207,32 @@ pub fn run_kill_partition_switch_threshold<C>(
 }
 
 pub fn create_custom_leader_schedule(
-    validator_num_slots: &[usize],
-) -> (LeaderSchedule, Vec<Arc<Keypair>>) {
+    validator_key_to_slots: impl Iterator<Item = (Pubkey, usize)>,
+) -> LeaderSchedule {
     let mut leader_schedule = vec![];
-    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
-        .take(validator_num_slots.len())
-        .collect();
-    for (k, num_slots) in validator_keys.iter().zip(validator_num_slots.iter()) {
-        for _ in 0..*num_slots {
-            leader_schedule.push(k.pubkey())
+    for (k, num_slots) in validator_key_to_slots {
+        for _ in 0..num_slots {
+            leader_schedule.push(k)
         }
     }
 
     info!("leader_schedule: {}", leader_schedule.len());
-    (
-        LeaderSchedule::new_from_schedule(leader_schedule),
-        validator_keys,
-    )
+    LeaderSchedule::new_from_schedule(leader_schedule)
+}
+
+pub fn create_custom_leader_schedule_with_random_keys(
+    validator_num_slots: &[usize],
+) -> (LeaderSchedule, Vec<Arc<Keypair>>) {
+    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
+        .take(validator_num_slots.len())
+        .collect();
+    let leader_schedule = create_custom_leader_schedule(
+        validator_keys
+            .iter()
+            .map(|k| k.pubkey())
+            .zip(validator_num_slots.iter().cloned()),
+    );
+    (leader_schedule, validator_keys)
 }
 
 /// This function runs a network, initiates a partition based on a
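
The refactor above splits leader-schedule construction from key generation: `create_custom_leader_schedule` now takes explicit (pubkey, slot-count) pairs, while `create_custom_leader_schedule_with_random_keys` keeps the old behavior for callers that only need arbitrary keys. Below is a minimal sketch of calling the new signature with known identities; the `solana_ledger`/`solana_sdk` paths and the free-standing `main` are assumptions for the sketch, and the helper body is copied here only so it compiles on its own (the real one lives in the local-cluster test `common` module shown above).

use {
    solana_ledger::leader_schedule::LeaderSchedule,
    solana_sdk::pubkey::Pubkey,
};

// Local copy of the refactored helper so this sketch is self-contained.
fn create_custom_leader_schedule(
    validator_key_to_slots: impl Iterator<Item = (Pubkey, usize)>,
) -> LeaderSchedule {
    let mut leader_schedule = vec![];
    for (k, num_slots) in validator_key_to_slots {
        for _ in 0..num_slots {
            leader_schedule.push(k)
        }
    }
    LeaderSchedule::new_from_schedule(leader_schedule)
}

fn main() {
    // Two known validator identities: the first owns slots 0..4, the second slots 4..8.
    let key_a = Pubkey::new_unique();
    let key_b = Pubkey::new_unique();
    let schedule = create_custom_leader_schedule(vec![(key_a, 4), (key_b, 4)].into_iter());
    // A LeaderSchedule can be indexed by slot, which the reworked test relies on below.
    assert_eq!(schedule[0], key_a);
    assert_eq!(schedule[4], key_b);
}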

View File

@@ -2,9 +2,9 @@
 use {
     assert_matches::assert_matches,
     common::{
-        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, ms_for_n_slots,
-        open_blockstore, purge_slots, remove_tower, restore_tower, run_cluster_partition,
-        run_kill_partition_switch_threshold, test_faulty_node,
+        copy_blocks, create_custom_leader_schedule_with_random_keys, last_vote_in_tower,
+        ms_for_n_slots, open_blockstore, purge_slots, remove_tower, restore_tower,
+        run_cluster_partition, run_kill_partition_switch_threshold, test_faulty_node,
         wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     crossbeam_channel::{unbounded, Receiver},
@@ -2489,8 +2489,10 @@ fn test_run_test_load_program_accounts_partition_root() {
 fn run_test_load_program_accounts_partition(scan_commitment: CommitmentConfig) {
     let num_slots_per_validator = 8;
     let partitions: [Vec<usize>; 2] = [vec![1], vec![1]];
-    let (leader_schedule, validator_keys) =
-        create_custom_leader_schedule(&[num_slots_per_validator, num_slots_per_validator]);
+    let (leader_schedule, validator_keys) = create_custom_leader_schedule_with_random_keys(&[
+        num_slots_per_validator,
+        num_slots_per_validator,
+    ]);
 
     let (update_client_sender, update_client_receiver) = unbounded();
     let (scan_client_sender, scan_client_receiver) = unbounded();

View File

@@ -3,8 +3,8 @@
 #![allow(clippy::integer_arithmetic)]
 use {
     common::{
-        copy_blocks, last_vote_in_tower, open_blockstore, purge_slots, remove_tower,
-        wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
+        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, open_blockstore,
+        purge_slots, remove_tower, wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     log::*,
     serial_test::serial,
@@ -13,13 +13,17 @@ use {
         ancestor_iterator::AncestorIterator,
         blockstore::Blockstore,
         blockstore_db::{AccessType, BlockstoreOptions},
+        leader_schedule::FixedSchedule,
     },
     solana_local_cluster::{
         cluster::Cluster,
         local_cluster::{ClusterConfig, LocalCluster},
         validator_configs::*,
     },
-    solana_sdk::signature::{Keypair, Signer},
+    solana_sdk::{
+        clock::Slot,
+        signature::{Keypair, Signer},
+    },
     solana_streamer::socket::SocketAddrSpace,
     std::{
         sync::Arc,
@@ -62,27 +66,22 @@ fn test_optimistic_confirmation_violation_without_tower() {
 //
 // S0 -> S1 -> S2 -> S4
 //
-// Step 3: Then restart `A` which had 31% of the stake. With the tower, from `A`'s
-// perspective it sees:
+// Step 3: Then restart `A` which had 31% of the stake, and remove S3 from its ledger, so
+// that it only sees `C`'s fork at S2. From `A`'s perspective it sees:
 //
-// S0 -> S1 -> S2 -> S3 (voted)
+// S0 -> S1 -> S2
 //             |
 //             -> S4 -> S5 (C's vote for S4)
 //
 // The fork choice rule weights look like:
 //
-// S0 -> S1 -> S2 (ABC) -> S3
+// S0 -> S1 -> S2 (ABC)
 //             |
 //             -> S4 (C) -> S5
 //
 // Step 5:
 // Without the persisted tower:
-// `A` would choose to vote on the fork with `S4 -> S5`. This is true even if `A`
-// generates a new fork starting at slot `S3` because `C` has more stake than `A`
-// so `A` will eventually pick the fork `C` is on.
-//
-// Furthermore `B`'s vote on `S3` is not observable because there are no
-// descendants of slot `S3`, so that fork will not be chosen over `C`'s fork
+// `A` would choose to vote on the fork with `S4 -> S5`.
 //
 // With the persisted tower:
 // `A` should not be able to generate a switching proof.
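
The "no switching proof" claim above comes down to stake arithmetic: after the restart only `C` votes on the competing `S4` fork, and `C`'s 33% is below the switch threshold. A back-of-the-envelope check, assuming Solana's commonly cited switch threshold of 38% (the `SWITCH_FORK_THRESHOLD` name and value are an assumption here, not something this diff touches):

fn main() {
    // Stakes from `node_stakes` below: A = 31, B = 36, C = 33, D = 0.
    let total_stake = 31 + 36 + 33 + 0;
    // Only validator C ends up voting on the S4 fork.
    let stake_on_other_fork = 33;
    // Assumed threshold; the consensus code names it SWITCH_FORK_THRESHOLD.
    let switch_fork_threshold = 0.38;
    let can_switch = (stake_on_other_fork as f64) / (total_stake as f64) > switch_fork_threshold;
    // 33% < 38%, so with its tower intact A cannot justify switching off its last vote.
    assert!(!can_switch);
}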
@@ -94,6 +93,10 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     let slots_per_epoch = 2048;
     let node_stakes = vec![31, 36, 33, 0];
+    let base_slot: Slot = 26; // S2
+    let next_slot_on_a: Slot = 27; // S3
+    let truncated_slots: Slot = 100; // just enough to purge all following slots after the S2 and S3
+
     // Each pubkeys are prefixed with A, B, C and D.
     // D is needed to:
     // 1) Propagate A's votes for S2 to validator C after A shuts down so that
@@ -128,10 +131,29 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     // because if validator A votes past `next_slot_on_a`, and then we copy over validator B's ledger
     // below only for slots <= `next_slot_on_a`, validator A will not know how it's last vote chains
     // to the otehr forks, and may violate switching proofs on restart.
-    let mut validator_configs =
-        make_identical_validator_configs(&ValidatorConfig::default_for_test(), node_stakes.len());
+    let mut default_config = ValidatorConfig::default_for_test();
+    // Split leader schedule 50-50 between validators B and C, don't give validator A any slots because
+    // it's going to be deleting its ledger, so may create versions of slots it's already created, but
+    // on a different fork.
+    let validator_to_slots = vec![
+        // Ensure validator b is leader for slots <= `next_slot_on_a`
+        (validator_b_pubkey, next_slot_on_a as usize + 1),
+        (validator_c_pubkey, next_slot_on_a as usize + 1),
+    ];
 
-    validator_configs[0].voting_disabled = true;
+    let leader_schedule = create_custom_leader_schedule(validator_to_slots.into_iter());
+    for slot in 0..=next_slot_on_a {
+        assert_eq!(leader_schedule[slot], validator_b_pubkey);
+    }
+
+    default_config.fixed_leader_schedule = Some(FixedSchedule {
+        start_epoch: 0,
+        leader_schedule: Arc::new(leader_schedule),
+    });
+    let mut validator_configs =
+        make_identical_validator_configs(&default_config, node_stakes.len());
+
+    // Disable voting on validator C
     validator_configs[2].voting_disabled = true;
 
     let mut config = ClusterConfig {
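
The hunk above is the heart of the de-flake: the leader schedule is pinned so that only `B` and `C` ever produce blocks, which means `A` can never mint its own versions of the slots that get purged later. A condensed, self-contained sketch of that pinning pattern, assuming the `solana_ledger`/`solana_sdk` crate paths and a stand-alone `main` (in the test itself the `FixedSchedule` is assigned to `ValidatorConfig::fixed_leader_schedule` as shown above):

use {
    solana_ledger::leader_schedule::{FixedSchedule, LeaderSchedule},
    solana_sdk::pubkey::Pubkey,
    std::sync::Arc,
};

fn main() {
    let validator_b_pubkey = Pubkey::new_unique();
    let validator_c_pubkey = Pubkey::new_unique();
    let next_slot_on_a: u64 = 27; // S3, matching the test constants above

    // B owns slots 0..=27, C owns the following 28 slots, A owns none.
    let mut slot_leaders = vec![validator_b_pubkey; next_slot_on_a as usize + 1];
    slot_leaders.extend(vec![validator_c_pubkey; next_slot_on_a as usize + 1]);
    let leader_schedule = LeaderSchedule::new_from_schedule(slot_leaders);
    assert_eq!(leader_schedule[0], validator_b_pubkey);
    assert_eq!(leader_schedule[next_slot_on_a], validator_b_pubkey);

    // Starting at epoch 0, this schedule overrides the stake-derived one.
    let _fixed = FixedSchedule {
        start_epoch: 0,
        leader_schedule: Arc::new(leader_schedule),
    };
}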
@@ -146,10 +168,6 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     };
     let mut cluster = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);
 
-    let base_slot = 26; // S2
-    let next_slot_on_a = 27; // S3
-    let truncated_slots = 100; // just enough to purge all following slots after the S2 and S3
-
     let val_a_ledger_path = cluster.ledger_path(&validator_a_pubkey);
     let val_b_ledger_path = cluster.ledger_path(&validator_b_pubkey);
     let val_c_ledger_path = cluster.ledger_path(&validator_c_pubkey);
@@ -167,43 +185,65 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         validator_c_pubkey, val_c_ledger_path
     );
 
-    // Immediately kill validator A, and C
-    info!("Exiting validators A and C");
-    let mut validator_a_info = cluster.exit_node(&validator_a_pubkey);
+    // Immediately kill validator C. No need to kill validator A because
+    // 1) It has no slots in the leader schedule, so no way to make forks
+    // 2) We need it to vote
+    info!("Exiting validator C");
     let mut validator_c_info = cluster.exit_node(&validator_c_pubkey);
 
     // Step 1:
-    // Let validator B, (D) run for a while.
+    // Let validator A, B, (D) run. Wait for both `A` and `B` to have voted on `next_slot_on_a` or
+    // one of its descendants
+    info!(
+        "Waiting on both validators A and B to vote on fork at slot {}",
+        next_slot_on_a
+    );
     let now = Instant::now();
+    let mut last_b_vote = 0;
+    let mut last_a_vote = 0;
     loop {
         let elapsed = now.elapsed();
         assert!(
             elapsed <= Duration::from_secs(30),
-            "Validator B failed to vote on any slot >= {} in {} secs",
+            "One of the validators failed to vote on a slot >= {} in {} secs,
+            last validator A vote: {},
+            last validator B vote: {}",
             next_slot_on_a,
-            elapsed.as_secs()
+            elapsed.as_secs(),
+            last_a_vote,
+            last_b_vote,
         );
         sleep(Duration::from_millis(100));
 
         if let Some((last_vote, _)) = last_vote_in_tower(&val_b_ledger_path, &validator_b_pubkey) {
+            last_b_vote = last_vote;
+            if last_vote < next_slot_on_a {
+                continue;
+            }
+        }
+
+        if let Some((last_vote, _)) = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
+            last_a_vote = last_vote;
             if last_vote >= next_slot_on_a {
                 break;
            }
        }
    }
 
-    // kill B
+    // kill A and B
     let _validator_b_info = cluster.exit_node(&validator_b_pubkey);
+    let validator_a_info = cluster.exit_node(&validator_a_pubkey);
 
     // Step 2:
-    // Stop validator and truncate ledger, copy over B's ledger to A
-    info!("truncate validator C's ledger");
+    // Stop validator and truncate ledger, copy over B's ledger to C
+    info!("Create validator C's ledger");
     {
         // first copy from validator B's ledger
        std::fs::remove_dir_all(&validator_c_info.info.ledger_path).unwrap();
        let mut opt = fs_extra::dir::CopyOptions::new();
        opt.copy_inside = true;
        fs_extra::dir::copy(&val_b_ledger_path, &val_c_ledger_path, &opt).unwrap();
 
-        // Remove B's tower in the C's new copied ledger
+        // Remove B's tower in C's new copied ledger
        remove_tower(&val_c_ledger_path, &validator_b_pubkey);
 
        let blockstore = open_blockstore(&val_c_ledger_path);
@@ -224,44 +264,14 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         purge_slots(&a_blockstore, next_slot_on_a + 1, truncated_slots);
     }
 
-    // Step 3:
-    // Restart A with voting enabled so that it can vote on B's fork
-    // up to `next_slot_on_a`, thereby optimistcally confirming `next_slot_on_a`
-    info!("Restarting A");
-    validator_a_info.config.voting_disabled = false;
-    cluster.restart_node(
-        &validator_a_pubkey,
-        validator_a_info,
-        SocketAddrSpace::Unspecified,
-    );
-
-    info!("Waiting for A to vote on slot descended from slot `next_slot_on_a`");
-    let now = Instant::now();
-    loop {
-        if let Some((last_vote_slot, _)) =
-            last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
-        {
-            if last_vote_slot >= next_slot_on_a {
-                info!(
-                    "Validator A has caught up and voted on slot: {}",
-                    last_vote_slot
-                );
-                break;
-            }
-        }
-
-        if now.elapsed().as_secs() >= 30 {
-            panic!(
-                "Validator A has not seen optimistic confirmation slot > {} in 30 seconds",
-                next_slot_on_a
-            );
-        }
-
-        sleep(Duration::from_millis(20));
-    }
-
-    info!("Killing A");
-    let validator_a_info = cluster.exit_node(&validator_a_pubkey);
+    // This should be guaranteed because we waited for validator `A` to vote on a slot > `next_slot_on_a`
+    // before killing it earlier.
+    info!("Checking A's tower for a vote on slot descended from slot `next_slot_on_a`");
+    let last_vote_slot = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey)
+        .unwrap()
+        .0;
+    assert!(last_vote_slot >= next_slot_on_a);
 
     {
         let blockstore = open_blockstore(&val_a_ledger_path);
         purge_slots(&blockstore, next_slot_on_a + 1, truncated_slots);
@@ -270,9 +280,9 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
             remove_tower(&val_a_ledger_path, &validator_a_pubkey);
 
             // Remove next_slot_on_a from ledger to force validator A to select
-            // votes_on_c_fork. Otherwise the validator A will immediately vote
-            // for 27 on restart, because it hasn't gotten the heavier fork from
-            // validator C yet.
+            // votes_on_c_fork. Otherwise, in the test case without a tower,
+            // the validator A will immediately vote for 27 on restart, because it
+            // hasn't gotten the heavier fork from validator C yet.
             // Then it will be stuck on 27 unable to switch because C doesn't
             // have enough stake to generate a switching proof
             purge_slots(&blockstore, next_slot_on_a, truncated_slots);
@@ -281,7 +291,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
         }
     }
 
-    // Step 4:
+    // Step 3:
     // Run validator C only to make it produce and vote on its own fork.
     info!("Restart validator C again!!!");
     validator_c_info.config.voting_disabled = false;
@@ -308,7 +318,7 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     assert!(!votes_on_c_fork.is_empty());
     info!("collected validator C's votes: {:?}", votes_on_c_fork);
 
-    // Step 5:
+    // Step 4:
     // verify whether there was violation or not
    info!("Restart validator A again!!!");
    cluster.restart_node(

View File

@@ -3,9 +3,10 @@
 #![allow(clippy::integer_arithmetic)]
 use {
     common::{
-        copy_blocks, create_custom_leader_schedule, last_vote_in_tower, ms_for_n_slots,
-        open_blockstore, restore_tower, run_cluster_partition, run_kill_partition_switch_threshold,
-        test_faulty_node, wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
+        copy_blocks, create_custom_leader_schedule_with_random_keys, last_vote_in_tower,
+        ms_for_n_slots, open_blockstore, restore_tower, run_cluster_partition,
+        run_kill_partition_switch_threshold, test_faulty_node,
+        wait_for_last_vote_in_tower_to_land_in_ledger, RUST_LOG_FILTER,
     },
     log::*,
     serial_test::serial,
@@ -279,7 +280,7 @@ fn test_kill_heaviest_partition() {
     // 4) Check for recovery
     let num_slots_per_validator = 8;
     let partitions: [Vec<usize>; 4] = [vec![11], vec![10], vec![10], vec![10]];
-    let (leader_schedule, validator_keys) = create_custom_leader_schedule(&[
+    let (leader_schedule, validator_keys) = create_custom_leader_schedule_with_random_keys(&[
         num_slots_per_validator * (partitions.len() - 1),
         num_slots_per_validator,
         num_slots_per_validator,