Persistent tower (#10718)

* Save/restore Tower

* Avoid unwrap()

* Rebase cleanups

* Forcibly pass test

* Correct reconciliation of votes after validator resume

* d b g

* Add more tests

* fsync and fix test

* Add test

* Fix fmt

* Debug

* Fix tests...

* save

* Clarify error message and code cleaning around it

* Move most of code out of tower save hot codepath

* Proper comment for the lack of fsync on tower

* Clean up

* Clean up

* Simpler type alias

* Manage tower-restored ancestor slots without banks

* Add comment

* Extract long code blocks...

* Add comment

* Simplify returned tuple...

* Tweak too aggressive log

* Fix typo...

* Add test

* Update comment

* Improve test to require non-empty stray restored slots

* Measure tower save and dump all tower contents

* Log adjust and add threshold related assertions

* cleanup adjust

* Properly lower stray restored slots priority...

* Rust fmt

* Fix test....

* Clarify comments a bit and add TowerError::TooNew

* Further clean-up around TowerError

* Truly create ancestors by excluding last vote slot

* Add comment for stray_restored_slots

* Add comment for stray_restored_slots

* Use BTreeSet

* Consider root_slot into post-replay adjustment

* Tweak logging

* Add test for stray_restored_ancestors

* Reorder some code

* Better names for unit tests

* Add frozen_abi to SavedTower

* Fold long lines

* Tweak stray ancestors and too old slot history

* Re-adjust error condition of too old slot history

* Test normal ancestors is checked before stray ones

* Fix conflict, update tests, adjust behavior a bit

* Fix test

* Address review comments

* Last touch!

* Immediately after creating cleaning pr

* Revert stray slots

* Revert comment...

* Report error as metrics

* Revert not to panic! and ignore unfixable test...

* Normalize lockouts.root_slot more strictly

* Add comments for panic! and more assertions

* Proper initialize root without vote account

* Clarify code and comments based on review feedback

* Fix rebase

* Further simplify based on assured tower root

* Reorder code for more readability

Co-authored-by: Michael Vines <mvines@gmail.com>
This commit is contained in:
Ryo Onodera
2020-09-19 14:03:54 +09:00
committed by GitHub
parent 28f2c15597
commit cb8661bd49
15 changed files with 1712 additions and 106 deletions

View File

@ -6,6 +6,7 @@ use crate::{
cluster_info::{ClusterInfo, Node},
cluster_info_vote_listener::VoteTracker,
completed_data_sets_service::CompletedDataSetsService,
consensus::{reconcile_blockstore_roots_with_tower, Tower, TowerError},
contact_info::ContactInfo,
gossip_service::{discover_cluster, GossipService},
poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS},
@ -95,6 +96,7 @@ pub struct ValidatorConfig {
pub accounts_hash_interval_slots: u64,
pub max_genesis_archive_unpacked_size: u64,
pub wal_recovery_mode: Option<BlockstoreRecoveryMode>,
pub require_tower: bool,
}
impl Default for ValidatorConfig {
@ -125,6 +127,7 @@ impl Default for ValidatorConfig {
accounts_hash_interval_slots: std::u64::MAX,
max_genesis_archive_unpacked_size: MAX_GENESIS_ARCHIVE_UNPACKED_SIZE,
wal_recovery_mode: None,
require_tower: false,
}
}
}
@ -253,7 +256,8 @@ impl Validator {
cache_block_time_sender,
cache_block_time_service,
},
) = new_banks_from_ledger(config, ledger_path, poh_verify, &exit);
tower,
) = new_banks_from_ledger(&id, vote_account, config, ledger_path, poh_verify, &exit);
let leader_schedule_cache = Arc::new(leader_schedule_cache);
let bank = bank_forks.working_bank();
@ -475,6 +479,7 @@ impl Validator {
ledger_signal_receiver,
&subscriptions,
&poh_recorder,
tower,
&leader_schedule_cache,
&exit,
completed_slots_receiver,
@ -613,8 +618,81 @@ impl Validator {
}
}
/// Reports whether `vote_account` exists in `bank`, decodes as a `VoteState`, and already
/// contains at least one vote.
///
/// Returns `false` when the account is missing, when its data cannot be interpreted as a
/// `VoteState`, or when the decoded state holds no votes.
fn active_vote_account_exists_in_bank(bank: &Arc<Bank>, vote_account: &Pubkey) -> bool {
    bank.get_account(vote_account)
        .and_then(|account| VoteState::from(&account))
        .map_or(false, |vote_state| !vote_state.votes.is_empty())
}
/// Turns the outcome of a tower restore attempt into a usable `Tower`.
///
/// A successfully restored tower has its lockouts adjusted against the root bank's slot and
/// slot history, and the adjusted tower is returned. If either the restore or the adjustment
/// failed:
///
/// * the error is reported as a `tower_error` datapoint, unless it is an I/O `NotFound` error
///   (i.e. no saved tower was found);
/// * the process exits with status 1 when `config.require_tower` is set and the vote account
///   in the working bank already contains votes;
/// * otherwise a new tower is built from `bank_forks` and returned.
fn post_process_restored_tower(
    restored_tower: crate::consensus::Result<Tower>,
    validator_identity: &Pubkey,
    vote_account: &Pubkey,
    config: &ValidatorConfig,
    ledger_path: &Path,
    bank_forks: &BankForks,
) -> Tower {
    let adjusted_tower = restored_tower.and_then(|tower| {
        let root_bank = bank_forks.root_bank();
        let history = root_bank.get_slot_history();
        tower.adjust_lockouts_after_replay(root_bank.slot(), &history)
    });

    // Happy path: the saved tower was restored and adjusted, nothing else to do.
    let err = match adjusted_tower {
        Ok(tower) => return tower,
        Err(err) => err,
    };

    let has_prior_votes =
        active_vote_account_exists_in_bank(&bank_forks.working_bank(), vote_account);
    let tower_file_absent = match &err {
        TowerError::IOError(io_err) => io_err.kind() == std::io::ErrorKind::NotFound,
        _ => false,
    };

    // A missing tower is not reported as a metric; every other failure is.
    if !tower_file_absent {
        datapoint_error!(
            "tower_error",
            (
                "error",
                format!("Unable to restore tower: {}", err),
                String
            ),
        );
    }

    if config.require_tower && has_prior_votes {
        error!("Requested mandatory tower restore failed: {}", err);
        error!(
            "And there is an existing vote_account containing actual votes. \
             Aborting due to possible conflicting duplicate votes"
        );
        process::exit(1);
    }

    if tower_file_absent && !has_prior_votes {
        // Currently, don't protect against spoofed snapshots with no tower at all
        info!(
            "Ignoring expected failed tower restore because this is the initial \
             validator start with the vote account..."
        );
    } else {
        error!(
            "Rebuilding a new tower from the latest vote account due to failed tower restore: {}",
            err
        );
    }

    Tower::new_from_bankforks(bank_forks, ledger_path, validator_identity, vote_account)
}
#[allow(clippy::type_complexity)]
fn new_banks_from_ledger(
validator_identity: &Pubkey,
vote_account: &Pubkey,
config: &ValidatorConfig,
ledger_path: &Path,
poh_verify: bool,
@ -628,6 +706,7 @@ fn new_banks_from_ledger(
LeaderScheduleCache,
Option<(Slot, Hash)>,
TransactionHistoryServices,
Tower,
) {
info!("loading ledger from {:?}...", ledger_path);
let genesis_config = open_genesis_config(ledger_path, config.max_genesis_archive_unpacked_size);
@ -659,6 +738,14 @@ fn new_banks_from_ledger(
.expect("Failed to open ledger database");
blockstore.set_no_compaction(config.no_rocksdb_compaction);
let restored_tower = Tower::restore(ledger_path, &validator_identity);
if let Ok(tower) = &restored_tower {
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap_or_else(|err| {
error!("Failed to reconcile blockstore with tower: {:?}", err);
std::process::exit(1);
});
}
let process_options = blockstore_processor::ProcessOptions {
poh_verify,
dev_halt_at_slot: config.dev_halt_at_slot,
@ -690,6 +777,17 @@ fn new_banks_from_ledger(
process::exit(1);
});
let tower = post_process_restored_tower(
restored_tower,
&validator_identity,
&vote_account,
&config,
&ledger_path,
&bank_forks,
);
info!("Tower state: {:?}", tower);
leader_schedule_cache.set_fixed_leader_schedule(config.fixed_leader_schedule.clone());
bank_forks.set_snapshot_config(config.snapshot_config.clone());
@ -704,6 +802,7 @@ fn new_banks_from_ledger(
leader_schedule_cache,
snapshot_hash,
transaction_history_services,
tower,
)
}