Add test for making sure switch doesn't happen past failure threshold (#11138)

Fix switch threshold

Co-authored-by: Carl <carl@solana.com>
carllin authored 2020-07-21 23:04:24 -07:00, committed by GitHub
commit e556f85178 · parent 3fd16cea34
4 changed files with 269 additions and 14 deletions


@@ -304,6 +304,60 @@ pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo],
    }
}
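/// Asserts that no node in `contact_infos` roots a new slot for at least
/// `num_slots_to_wait` slots: record each node's current root, wait until the
/// cluster's tip (recent commitment) is `num_slots_to_wait` slots past the highest
/// slot observed at the start, then check that every node's root is unchanged.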
pub fn check_no_new_roots(
    num_slots_to_wait: usize,
    contact_infos: &[ContactInfo],
    test_name: &str,
) {
    assert!(!contact_infos.is_empty());
    let mut roots = vec![0; contact_infos.len()];
    let max_slot = contact_infos
        .iter()
        .enumerate()
        .map(|(i, ingress_node)| {
            let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE);
            let initial_root = client
                .get_slot()
                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id));
            roots[i] = initial_root;
            client
                .get_slot_with_commitment(CommitmentConfig::recent())
                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id))
        })
        .max()
        .unwrap();
    let end_slot = max_slot + num_slots_to_wait as u64;
    let mut current_slot;
    let mut last_print = Instant::now();
    let client = create_client(contact_infos[0].client_facing_addr(), VALIDATOR_PORT_RANGE);
    loop {
        current_slot = client
            .get_slot_with_commitment(CommitmentConfig::recent())
            .unwrap_or_else(|_| panic!("get_slot for {} failed", contact_infos[0].id));
        if current_slot > end_slot {
            break;
        }
        if last_print.elapsed().as_secs() > 3 {
            info!(
                "{} current slot: {}, waiting for slot: {}",
                test_name, current_slot, end_slot
            );
            last_print = Instant::now();
        }
    }
    for (i, ingress_node) in contact_infos.iter().enumerate() {
        let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE);
        assert_eq!(
            client
                .get_slot()
                .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id)),
            roots[i]
        );
    }
}
fn poll_all_nodes_for_signature(
    entry_point_info: &ContactInfo,
    cluster_nodes: &[ContactInfo],


@@ -360,6 +360,25 @@ impl LocalCluster {
        info!("{} done waiting for roots", test_name);
    }
    pub fn check_no_new_roots(&self, num_slots_to_wait: usize, test_name: &str) {
        let alive_node_contact_infos: Vec<_> = self
            .validators
            .values()
            .map(|v| v.info.contact_info.clone())
            .collect();
        assert!(!alive_node_contact_infos.is_empty());
        info!("{} discovering nodes", test_name);
        let cluster_nodes = discover_cluster(
            &alive_node_contact_infos[0].gossip,
            alive_node_contact_infos.len(),
        )
        .unwrap();
        info!("{} discovered {} nodes", test_name, cluster_nodes.len());
        info!("{} making sure no new roots on any nodes", test_name);
        cluster_tests::check_no_new_roots(num_slots_to_wait, &alive_node_contact_infos, test_name);
        info!("{} done waiting for roots", test_name);
    }
    fn transfer_with_client(
        client: &ThinClient,
        source_keypair: &Keypair,


@@ -4,8 +4,10 @@ use serial_test_derive::serial;
use solana_client::rpc_client::RpcClient;
use solana_client::thin_client::create_client;
use solana_core::{
    broadcast_stage::BroadcastStageType, consensus::VOTE_THRESHOLD_DEPTH,
    gossip_service::discover_cluster, validator::ValidatorConfig,
    broadcast_stage::BroadcastStageType,
    consensus::{SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
    gossip_service::discover_cluster,
    validator::ValidatorConfig,
};
use solana_download_utils::download_snapshot;
use solana_ledger::{
@@ -407,6 +409,137 @@ fn test_kill_heaviest_partition() {
    )
}
#[allow(clippy::assertions_on_constants)]
fn run_kill_partition_switch_threshold<F>(
    failures_stake: u64,
    alive_stake_1: u64,
    alive_stake_2: u64,
    on_partition_resolved: F,
) where
    F: Fn(&mut LocalCluster),
{
    // Needs to be at least 1/3 or there will be no overlap
    // with the confirmation supermajority 2/3
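    // (any stake set larger than 1/3 of the total must intersect any set holding
    // the 2/3 confirmation supermajority, since together they would exceed 100%)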
    assert!(SWITCH_FORK_THRESHOLD >= 1f64 / 3f64);
    info!(
        "stakes: {} {} {}",
        failures_stake, alive_stake_1, alive_stake_2
    );
    // This test:
    // 1) Spins up three partitions
    // 2) Kills the first partition with the stake `failures_stake`
    // 3) Runs `on_partition_resolved` once the partition resolves
    let mut leader_schedule = vec![];
    let num_slots_per_validator = 8;
    let partitions: [&[usize]; 3] = [
        &[(failures_stake as usize)],
        &[(alive_stake_1 as usize)],
        &[(alive_stake_2 as usize)],
    ];
    let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
        .take(partitions.len())
        .collect();
    for (i, k) in validator_keys.iter().enumerate() {
        let num_slots = {
            if i == 0 {
                // Set up the leader to have 50% of the slots
                num_slots_per_validator * (partitions.len() - 1)
            } else {
                num_slots_per_validator
            }
        };
        for _ in 0..num_slots {
            leader_schedule.push(k.pubkey())
        }
    }
    info!("leader_schedule: {}", leader_schedule.len());
    let validator_to_kill = validator_keys[0].pubkey();
    let on_partition_start = |cluster: &mut LocalCluster| {
        info!("Killing validator with id: {}", validator_to_kill);
        cluster.exit_node(&validator_to_kill);
    };
    run_cluster_partition(
        &partitions,
        Some((
            LeaderSchedule::new_from_schedule(leader_schedule),
            validator_keys,
        )),
        on_partition_start,
        on_partition_resolved,
    )
}
#[test]
#[serial]
fn test_kill_partition_switch_threshold_no_progress() {
    let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD;
    let total_stake = 10_000;
    let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64;
    let failures_stake = max_failures_stake;
    let total_alive_stake = total_stake - failures_stake;
    let alive_stake_1 = total_alive_stake / 2;
    let alive_stake_2 = total_alive_stake - alive_stake_1;
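    // Worked example (assuming, for illustration, SWITCH_FORK_THRESHOLD = 0.38):
    //   max_switch_threshold_failure_pct = 1.0 - 2.0 * 0.38 = 0.24
    //   failures_stake = 2_400, total_alive_stake = 7_600,
    //   alive_stake_1 = alive_stake_2 = 3_800 (exactly 38% of total stake each),
    //   so neither live fork exceeds SWITCH_FORK_THRESHOLD, no switching proof can
    //   be generated, and no new roots should be made.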
    // Check that no new roots were set 400 slots after partition resolves (gives time
    // for lockouts built during partition to resolve and gives validators an opportunity
    // to try and switch forks)
    let on_partition_resolved = |cluster: &mut LocalCluster| {
        cluster.check_no_new_roots(400, &"PARTITION_TEST");
    };
    // This kills `max_failures_stake`, so no progress should be made
    run_kill_partition_switch_threshold(
        failures_stake,
        alive_stake_1,
        alive_stake_2,
        on_partition_resolved,
    );
}
#[test]
#[serial]
fn test_kill_partition_switch_threshold() {
    let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD;
    let total_stake = 10_000;
    // Kill `< max_failures_stake` of the validators
    let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64;
    let failures_stake = max_failures_stake - 1;
    let total_alive_stake = total_stake - failures_stake;
    // Partition the remaining alive validators, should still make progress
    // once the partition resolves
    let alive_stake_1 = total_alive_stake / 2;
    let alive_stake_2 = total_alive_stake - alive_stake_1;
    let bigger = std::cmp::max(alive_stake_1, alive_stake_2);
    let smaller = std::cmp::min(alive_stake_1, alive_stake_2);
    // At least one of the forks must have > SWITCH_FORK_THRESHOLD of the stake in order
    // to guarantee switching proofs can be created. Make sure the other fork
    // is <= SWITCH_FORK_THRESHOLD so that progress can still be made. This catches
    // bugs such as liveness issues in bank-weighted fork choice, which may stall
    // because the fork with less stake could have more bank weight, but the other fork:
    // 1) Would not be able to generate a switching proof
    // 2) Is the more heavily staked fork, so once it stops voting it doesn't catch up in bank weight.
    assert!(
        bigger as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD
            && smaller as f64 / total_stake as f64 <= SWITCH_FORK_THRESHOLD
    );
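    // Worked example (assuming, for illustration, SWITCH_FORK_THRESHOLD = 0.38):
    //   max_failures_stake = 2_400, failures_stake = 2_399, total_alive_stake = 7_601,
    //   alive_stake_1 = 3_800 (38%, exactly at the threshold) and
    //   alive_stake_2 = 3_801 (38.01%, just above it), so switching proofs can be
    //   generated for exactly one fork and the cluster should resume making roots.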
    let on_partition_resolved = |cluster: &mut LocalCluster| {
        cluster.check_for_new_roots(16, &"PARTITION_TEST");
    };
    run_kill_partition_switch_threshold(
        failures_stake,
        alive_stake_1,
        alive_stake_2,
        on_partition_resolved,
    );
}
#[test]
#[serial]
fn test_two_unbalanced_stakes() {