@@ -1,11 +1,14 @@
extern crate solana;

use crate::solana::blocktree::Blocktree;
use hashbrown::HashSet;
use log::*;
use solana::cluster::Cluster;
use solana::cluster_tests;
use solana::gossip_service::discover_cluster;
use solana::local_cluster::{ClusterConfig, LocalCluster};
use solana::validator::ValidatorConfig;
use solana_runtime::epoch_schedule::{EpochSchedule, MINIMUM_SLOT_LENGTH};
use solana_sdk::poh_config::PohConfig;
use solana_sdk::timing;
use std::time::Duration;
@@ -207,3 +210,106 @@ fn test_listener_startup() {
    let (cluster_nodes, _) = discover_cluster(&cluster.entry_point_info.gossip, 4).unwrap();
    assert_eq!(cluster_nodes.len(), 4);
}

#[test]
fn test_repairman_catchup() {
    run_repairman_catchup(5);
}

fn run_repairman_catchup(num_repairmen: u64) {
    let mut validator_config = ValidatorConfig::default();
    let num_ticks_per_second = 100;
    let num_ticks_per_slot = 40;
    let num_slots_per_epoch = MINIMUM_SLOT_LENGTH as u64;
    let num_root_buffer_slots = 10;
    // Calculate the leader schedule num_root_buffer_slots ahead. Otherwise, if stakers_slot_offset ==
    // num_slots_per_epoch, and num_slots_per_epoch == MINIMUM_SLOT_LENGTH, then repairmen
    // will stop sending repairs after the last slot in epoch 1 (0-indexed), because the root
    // is at most in the first epoch.
    //
    // For example:
    // Assume:
    // 1) num_slots_per_epoch = 32
    // 2) stakers_slot_offset = 32
    // 3) MINIMUM_SLOT_LENGTH = 32
    //
    // Then the last slot in epoch 1 is slot 63. After completing slots 0 to 63, the root on the
    // repairee is at most 31. Because stakers_slot_offset == 32, the max confirmed epoch
    // on the repairee is epoch 1.
    // Thus the repairmen won't send any slots past epoch 1, slot 63, to this repairee until the repairee
    // updates their root, and the repairee can't update their root until they get slot 64, so no progress
    // is made. This is also not accounting for the fact that the repairee may not vote on every slot, so
    // their root could actually be much less than 31. This is why we give a num_root_buffer_slots buffer.
    let stakers_slot_offset = num_slots_per_epoch + num_root_buffer_slots;

    validator_config.rpc_config.enable_fullnode_exit = true;

    let lamports_per_repairman = 1000;

    // Make the repairee_stake small relative to the repairmen's stake so that the repairee doesn't
    // get included in the leader schedule, causing slots to get skipped while it's still trying
    // to catch up.
    let repairee_stake = 3;
    let cluster_lamports = 2 * lamports_per_repairman * num_repairmen + repairee_stake;
    let node_stakes: Vec<_> = (0..num_repairmen).map(|_| lamports_per_repairman).collect();
    let mut cluster = LocalCluster::new(&ClusterConfig {
        node_stakes,
        cluster_lamports,
        validator_config: validator_config.clone(),
        ticks_per_slot: num_ticks_per_slot,
        slots_per_epoch: num_slots_per_epoch,
        stakers_slot_offset,
        poh_config: PohConfig::new_sleep(Duration::from_millis(1000 / num_ticks_per_second)),
        ..ClusterConfig::default()
    });

    let repairman_pubkeys: HashSet<_> = cluster.get_node_pubkeys().into_iter().collect();
    let epoch_schedule = EpochSchedule::new(num_slots_per_epoch, stakers_slot_offset, true);
    let num_warmup_epochs = (epoch_schedule.get_stakers_epoch(0) + 1) as f64;

    // Sleep for longer than the first N warmup epochs, with a one epoch buffer for timing issues
    cluster_tests::sleep_n_epochs(
        num_warmup_epochs + 1.0,
        &cluster.genesis_block.poh_config,
        num_ticks_per_slot,
        num_slots_per_epoch,
    );

    // Start up a new node, wait for catchup. Backwards repair won't be sufficient because the
    // leader is sending blobs past this validator's first two confirmed epochs. Thus, the repairman
    // protocol will have to kick in for this validator to repair.

    cluster.add_validator(&validator_config, repairee_stake);

    let all_pubkeys = cluster.get_node_pubkeys();
    let repairee_id = all_pubkeys
        .into_iter()
        .find(|x| !repairman_pubkeys.contains(x))
        .unwrap();

    // Wait for repairman protocol to catch this validator up
    cluster_tests::sleep_n_epochs(
        num_warmup_epochs + 1.0,
        &cluster.genesis_block.poh_config,
        num_ticks_per_slot,
        num_slots_per_epoch,
    );

    cluster.close_preserve_ledgers();
    let validator_ledger_path = cluster.fullnode_infos[&repairee_id].ledger_path.clone();

    // Expect at least the first two epochs to have been rooted after waiting 3 epochs.
    let num_expected_slots = num_slots_per_epoch * 2;
    let validator_ledger = Blocktree::open(&validator_ledger_path).unwrap();
    let validator_rooted_slots: Vec<_> =
        validator_ledger.rooted_slot_iterator(0).unwrap().collect();

    if validator_rooted_slots.len() as u64 <= num_expected_slots {
        error!(
            "Num expected slots: {}, number of rooted slots: {}",
            num_expected_slots,
            validator_rooted_slots.len()
        );
    }
    assert!(validator_rooted_slots.len() as u64 > num_expected_slots);
}
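
The root-buffer arithmetic in the long comment above can be checked in isolation. The sketch below is not part of the diff: the fixed-size epoch layout (no warmup) and the epoch_of / max_confirmed_epoch helpers are simplifying assumptions restated from that comment, not APIs from the Solana codebase.

// Minimal standalone sketch of the root-buffer arithmetic from the comment above.
// Assumptions: every epoch has `slots_per_epoch` slots (no warmup); `epoch_of` and
// `max_confirmed_epoch` are illustrative helpers, not Solana APIs.

/// Epoch containing `slot`, assuming fixed-size epochs.
fn epoch_of(slot: u64, slots_per_epoch: u64) -> u64 {
    slot / slots_per_epoch
}

/// Highest epoch a node rooted at `root` can have a leader schedule for, per the
/// rule sketched in the comment: schedules are computed `stakers_slot_offset`
/// slots ahead of the root.
fn max_confirmed_epoch(root: u64, slots_per_epoch: u64, stakers_slot_offset: u64) -> u64 {
    epoch_of(root + stakers_slot_offset, slots_per_epoch)
}

fn main() {
    let slots_per_epoch = 32;
    let root = 31; // best case after completing slots 0..=63 (see comment)

    // Without the buffer, the confirmed range ends at epoch 1 (slot 63), so the
    // repairmen never send slot 64 and the repairee's root can never advance.
    assert_eq!(max_confirmed_epoch(root, slots_per_epoch, slots_per_epoch), 1);

    // With the 10-slot buffer, the confirmed range reaches epoch 2, which
    // contains slot 64, so repairs and the root keep making progress.
    assert_eq!(max_confirmed_epoch(root, slots_per_epoch, slots_per_epoch + 10), 2);
}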