Persistent tower (#10718)

* Save/restore Tower

* Avoid unwrap()

* Rebase cleanups

* Forcibly pass test

* Correct reconciliation of votes after validator resume

* d b g

* Add more tests

* fsync and fix test

* Add test

* Fix fmt

* Debug

* Fix tests...

* save

* Clarify error message and code cleaning around it

* Move most of code out of tower save hot codepath

* Proper comment for the lack of fsync on tower

* Clean up

* Clean up

* Simpler type alias

* Manage tower-restored ancestor slots without banks

* Add comment

* Extract long code blocks...

* Add comment

* Simplify returned tuple...

* Tweak too aggressive log

* Fix typo...

* Add test

* Update comment

* Improve test to require non-empty stray restored slots

* Measure tower save and dump all tower contents

* Log adjust and add threshold related assertions

* cleanup adjust

* Properly lower stray restored slots priority...

* Rust fmt

* Fix test....

* Clarify comments a bit and add TowerError::TooNew

* Further clean-up around TowerError

* Truly create ancestors by excluding last vote slot

* Add comment for stray_restored_slots

* Add comment for stray_restored_slots

* Use BTreeSet

* Consider root_slot in post-replay adjustment

* Tweak logging

* Add test for stray_restored_ancestors

* Reorder some code

* Better names for unit tests

* Add frozen_abi to SavedTower

* Fold long lines

* Tweak stray ancestors and too old slot history

* Re-adjust error condition of too old slot history

* Test that normal ancestors are checked before stray ones

* Fix conflict, update tests, adjust behavior a bit

* Fix test

* Address review comments

* Last touch!

* Immediately after creating cleaning PR

* Revert stray slots

* Revert comment...

* Report error as metrics

* Revert so as not to panic! and ignore unfixable test...

* Normalize lockouts.root_slot more strictly

* Add comments for panic! and more assertions

* Properly initialize root without vote account

* Clarify code and comments based on review feedback

* Fix rebase

* Further simplify based on assured tower root

* Reorder code for more readability

Co-authored-by: Michael Vines <mvines@gmail.com>
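
As a rough illustration of the "Save/restore Tower" flow listed above, here is a minimal sketch built only from the Tower calls exercised by the tests in this diff (Tower::restore, Tower::save, Tower::get_filename, Tower::root). The helper function name, error handling, and log messages are illustrative assumptions, not the validator's actual startup path.

// A minimal sketch, assuming the Tower API as used by the tests below.
use solana_core::consensus::Tower;
use solana_sdk::signature::{Keypair, Signer};
use std::{path::Path, sync::Arc};

fn demo_tower_roundtrip(ledger_path: &Path, identity: &Arc<Keypair>) {
    let node_pubkey = identity.pubkey();
    match Tower::restore(ledger_path, &node_pubkey) {
        Ok(tower) => {
            // The restored tower carries the root recorded before shutdown.
            println!("restored tower root: {:?}", tower.root());
            // Persist it again, signed with the validator identity keypair.
            tower.save(identity).expect("failed to save tower");
        }
        Err(err) => {
            // No usable tower file; with `require_tower: false` the validator
            // instead rebuilds its tower from the vote account contents.
            eprintln!(
                "could not restore tower from {:?}: {:?}",
                Tower::get_filename(ledger_path, &node_pubkey),
                err
            );
        }
    }
}
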
Author: Ryo Onodera (committed by GitHub)
Date: 2020-09-19 14:03:54 +09:00
parent 28f2c15597
commit cb8661bd49
15 changed files with 1712 additions and 106 deletions


@@ -9,7 +9,7 @@ use solana_client::{
use solana_core::{
broadcast_stage::BroadcastStageType,
cluster_info::VALIDATOR_PORT_RANGE,
consensus::{SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
consensus::{Tower, SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
gossip_service::discover_cluster,
validator::ValidatorConfig,
};
@@ -1370,18 +1370,19 @@ fn test_no_voting() {
}
#[test]
fn test_optimistic_confirmation_violation() {
#[serial]
fn test_optimistic_confirmation_violation_with_no_tower() {
solana_logger::setup();
let mut buf = BufferRedirect::stderr().unwrap();
// First set up the cluster with 2 nodes
let slots_per_epoch = 2048;
let node_stakes = vec![50, 51];
let node_stakes = vec![51, 50];
let validator_keys: Vec<_> = iter::repeat_with(|| (Arc::new(Keypair::new()), true))
.take(node_stakes.len())
.collect();
let config = ClusterConfig {
cluster_lamports: 100_000,
node_stakes: vec![51, 50],
node_stakes: node_stakes.clone(),
validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
validator_keys: Some(validator_keys),
slots_per_epoch,
@@ -1415,7 +1416,9 @@ fn test_optimistic_confirmation_violation() {
// Mark fork as dead on the heavier validator; this should make the fork effectively
// dead, even though it was optimistically confirmed. The smaller validator should
// jump over to the new fork
// create and jump over to a new fork
// Also, remove the saved tower to intentionally make the restarted validator violate
// optimistic confirmation
{
let blockstore = Blockstore::open_with_access_type(
&exited_validator_info.info.ledger_path,
@@ -1433,6 +1436,12 @@ fn test_optimistic_confirmation_violation() {
prev_voted_slot
);
blockstore.set_dead_slot(prev_voted_slot).unwrap();
std::fs::remove_file(Tower::get_filename(
&exited_validator_info.info.ledger_path,
&entry_point_id,
))
.unwrap();
}
cluster.restart_node(&entry_point_id, exited_validator_info);
@@ -1465,6 +1474,220 @@ fn test_optimistic_confirmation_violation() {
assert!(output.contains(&expected_log));
}
#[test]
#[serial]
#[ignore]
fn test_no_optimistic_confirmation_violation_with_tower() {
solana_logger::setup();
let mut buf = BufferRedirect::stderr().unwrap();
// First set up the cluster with 2 nodes
let slots_per_epoch = 2048;
let node_stakes = vec![51, 50];
let validator_keys: Vec<_> = iter::repeat_with(|| (Arc::new(Keypair::new()), true))
.take(node_stakes.len())
.collect();
let config = ClusterConfig {
cluster_lamports: 100_000,
node_stakes: node_stakes.clone(),
validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
validator_keys: Some(validator_keys),
slots_per_epoch,
stakers_slot_offset: slots_per_epoch,
skip_warmup_slots: true,
..ClusterConfig::default()
};
let mut cluster = LocalCluster::new(&config);
let entry_point_id = cluster.entry_point_info.id;
// Let the nodes run for a while. Wait for validators to vote on slot `S`
// so that the vote on `S-1` is definitely in gossip and optimistic confirmation is
// detected on slot `S-1` for sure, then stop the heavier of the two
// validators
let client = cluster.get_validator_client(&entry_point_id).unwrap();
let mut prev_voted_slot = 0;
loop {
let last_voted_slot = client
.get_slot_with_commitment(CommitmentConfig::recent())
.unwrap();
if last_voted_slot > 50 {
if prev_voted_slot == 0 {
prev_voted_slot = last_voted_slot;
} else {
break;
}
}
sleep(Duration::from_millis(100));
}
let exited_validator_info = cluster.exit_node(&entry_point_id);
// Mark fork as dead on the heavier validator; this should make the fork effectively
// dead, even though it was optimistically confirmed. The smaller validator should
// create and jump over to a new fork
{
let blockstore = Blockstore::open_with_access_type(
&exited_validator_info.info.ledger_path,
AccessType::PrimaryOnly,
None,
)
.unwrap_or_else(|e| {
panic!(
"Failed to open ledger at {:?}, err: {}",
exited_validator_info.info.ledger_path, e
);
});
info!(
"Setting slot: {} on main fork as dead, should cause fork",
prev_voted_slot
);
blockstore.set_dead_slot(prev_voted_slot).unwrap();
}
cluster.restart_node(&entry_point_id, exited_validator_info);
cluster.check_no_new_roots(400, "test_no_optimistic_confirmation_violation_with_tower");
// Check to see that the validator didn't detect that optimistic confirmation for
// `prev_voted_slot` failed
let expected_log = format!("Optimistic slot {} was not rooted", prev_voted_slot);
let mut output = String::new();
buf.read_to_string(&mut output).unwrap();
assert!(!output.contains(&expected_log));
}
#[test]
#[serial]
fn test_validator_saves_tower() {
solana_logger::setup();
let validator_config = ValidatorConfig {
require_tower: true,
..ValidatorConfig::default()
};
let validator_identity_keypair = Arc::new(Keypair::new());
let validator_id = validator_identity_keypair.pubkey();
let config = ClusterConfig {
cluster_lamports: 10_000,
node_stakes: vec![100],
validator_configs: vec![validator_config],
validator_keys: Some(vec![(validator_identity_keypair.clone(), true)]),
..ClusterConfig::default()
};
let mut cluster = LocalCluster::new(&config);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
let ledger_path = cluster
.validators
.get(&validator_id)
.unwrap()
.info
.ledger_path
.clone();
// Wait for some votes to be generated
let mut last_replayed_root;
loop {
if let Ok(slot) = validator_client.get_slot_with_commitment(CommitmentConfig::recent()) {
trace!("current slot: {}", slot);
if slot > 2 {
// this will be the root next time a validator starts
last_replayed_root = slot;
break;
}
}
sleep(Duration::from_millis(10));
}
// Stop validator and check saved tower
let validator_info = cluster.exit_node(&validator_id);
let tower1 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower1: {:?}", tower1);
assert_eq!(tower1.root(), Some(0));
// Restart the validator and wait for a new root
cluster.restart_node(&validator_id, validator_info);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
// Wait for the first root
loop {
if let Ok(root) = validator_client.get_slot_with_commitment(CommitmentConfig::root()) {
trace!("current root: {}", root);
if root > last_replayed_root + 1 {
last_replayed_root = root;
break;
}
}
sleep(Duration::from_millis(50));
}
// Stop validator and check saved tower
let recent_slot = validator_client
.get_slot_with_commitment(CommitmentConfig::recent())
.unwrap();
let validator_info = cluster.exit_node(&validator_id);
let tower2 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower2: {:?}", tower2);
assert_eq!(tower2.root(), Some(last_replayed_root));
last_replayed_root = recent_slot;
// Roll back the saved tower to `tower1` to simulate a validator starting from a newer snapshot
// without having to wait for that snapshot to be generated in this test
tower1.save(&validator_identity_keypair).unwrap();
cluster.restart_node(&validator_id, validator_info);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
// Wait for a new root, demonstrating the validator was able to make progress from the older `tower1`
loop {
if let Ok(root) = validator_client.get_slot_with_commitment(CommitmentConfig::root()) {
trace!(
"current root: {}, last_replayed_root: {}",
root,
last_replayed_root
);
if root > last_replayed_root {
break;
}
}
sleep(Duration::from_millis(50));
}
// Check the new root is reflected in the saved tower state
let mut validator_info = cluster.exit_node(&validator_id);
let tower3 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower3: {:?}", tower3);
assert!(tower3.root().unwrap() > last_replayed_root);
// Remove the tower file entirely and allow the validator to start without a tower. It will
// rebuild tower from its vote account contents
fs::remove_file(Tower::get_filename(&ledger_path, &validator_id)).unwrap();
validator_info.config.require_tower = false;
cluster.restart_node(&validator_id, validator_info);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
// Wait for a couple more slots to pass so another vote occurs
let current_slot = validator_client
.get_slot_with_commitment(CommitmentConfig::recent())
.unwrap();
loop {
if let Ok(slot) = validator_client.get_slot_with_commitment(CommitmentConfig::recent()) {
trace!("current_slot: {}, slot: {}", current_slot, slot);
if slot > current_slot + 1 {
break;
}
}
sleep(Duration::from_millis(50));
}
cluster.close_preserve_ledgers();
let tower4 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower4: {:?}", tower4);
// Should tower4's root advance by exactly 1 slot compared to tower3?
assert_eq!(tower4.root(), tower3.root().map(|s| s + 1));
}
fn wait_for_next_snapshot(
cluster: &LocalCluster,
snapshot_package_output_path: &Path,