Persistent tower (#10718)

* Save/restore Tower

* Avoid unwrap()

* Rebase cleanups

* Forcibly pass test

* Correct reconciliation of votes after validator resume

* d b g

* Add more tests

* fsync and fix test

* Add test

* Fix fmt

* Debug

* Fix tests...

* save

* Clarify error message and code cleaning around it

* Move most of code out of tower save hot codepath

* Proper comment for the lack of fsync on tower

* Clean up

* Clean up

* Simpler type alias

* Manage tower-restored ancestor slots without banks

* Add comment

* Extract long code blocks...

* Add comment

* Simplify returned tuple...

* Tweak too aggressive log

* Fix typo...

* Add test

* Update comment

* Improve test to require non-empty stray restored slots

* Measure tower save and dump all tower contents

* Log adjust and add threshold related assertions

* cleanup adjust

* Properly lower stray restored slots priority...

* Rust fmt

* Fix test....

* Clarify comments a bit and add TowerError::TooNew

* Further clean-up around TowerError

* Truly create ancestors by excluding last vote slot

* Add comment for stray_restored_slots

* Add comment for stray_restored_slots

* Use BTreeSet

* Consider root_slot in post-replay adjustment

* Tweak logging

* Add test for stray_restored_ancestors

* Reorder some code

* Better names for unit tests

* Add frozen_abi to SavedTower

* Fold long lines

* Tweak stray ancestors and too old slot history

* Re-adjust error condition of too old slot history

* Test that normal ancestors are checked before stray ones

* Fix conflict, update tests, adjust behavior a bit

* Fix test

* Address review comments

* Last touch!

* Immediately after creating cleaning PR

* Revert stray slots

* Revert comment...

* Report error as metrics

* Revert so as not to panic! and ignore unfixable test...

* Normalize lockouts.root_slot more strictly

* Add comments for panic! and more assertions

* Properly initialize root without vote account

* Clarify code and comments based on review feedback

* Fix rebase

* Further simplify based on assured tower root

* Reorder code for more readability

Co-authored-by: Michael Vines <mvines@gmail.com>
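
As a rough illustration of the "Save/restore Tower" flow listed above, here is a minimal sketch built only from the Tower calls exercised by the tests in this diff (Tower::restore, Tower::save, Tower::get_filename, Tower::root). The helper function name, error handling, and log messages are illustrative assumptions, not the validator's actual startup path.

// A minimal sketch, assuming the Tower API as used by the tests below.
use solana_core::consensus::Tower;
use solana_sdk::signature::{Keypair, Signer};
use std::{path::Path, sync::Arc};

fn demo_tower_roundtrip(ledger_path: &Path, identity: &Arc<Keypair>) {
    let node_pubkey = identity.pubkey();
    match Tower::restore(ledger_path, &node_pubkey) {
        Ok(tower) => {
            // The restored tower carries the root recorded before shutdown.
            println!("restored tower root: {:?}", tower.root());
            // Persist it again, signed with the validator identity keypair.
            tower.save(identity).expect("failed to save tower");
        }
        Err(err) => {
            // No usable tower file; with `require_tower: false` the validator
            // instead rebuilds its tower from the vote account contents.
            eprintln!(
                "could not restore tower from {:?}: {:?}",
                Tower::get_filename(ledger_path, &node_pubkey),
                err
            );
        }
    }
}
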
Author: Ryo Onodera (committed by GitHub)
Date: 2020-09-19 14:03:54 +09:00
parent 28f2c15597
commit cb8661bd49
15 changed files with 1712 additions and 106 deletions


@@ -9,7 +9,7 @@ use solana_client::{
use solana_core::{
broadcast_stage::BroadcastStageType,
cluster_info::VALIDATOR_PORT_RANGE,
consensus::{SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
consensus::{Tower, SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH},
gossip_service::discover_cluster,
validator::ValidatorConfig,
};
@@ -1370,18 +1370,19 @@ fn test_no_voting() {
}
#[test]
fn test_optimistic_confirmation_violation() {
#[serial]
fn test_optimistic_confirmation_violation_with_no_tower() {
solana_logger::setup();
let mut buf = BufferRedirect::stderr().unwrap();
// First set up the cluster with 2 nodes
let slots_per_epoch = 2048;
let node_stakes = vec![50, 51];
let node_stakes = vec![51, 50];
let validator_keys: Vec<_> = iter::repeat_with(|| (Arc::new(Keypair::new()), true))
.take(node_stakes.len())
.collect();
let config = ClusterConfig {
cluster_lamports: 100_000,
node_stakes: vec![51, 50],
node_stakes: node_stakes.clone(),
validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
validator_keys: Some(validator_keys),
slots_per_epoch,
@@ -1415,7 +1416,9 @@ fn test_optimistic_confirmation_violation() {
// Mark fork as dead on the heavier validator; this should make the fork effectively
// dead, even though it was optimistically confirmed. The smaller validator should
// jump over to the new fork
// create and jump over to a new fork
// Also, remove the saved tower to intentionally make the restarted validator violate
// optimistic confirmation
{
let blockstore = Blockstore::open_with_access_type(
&exited_validator_info.info.ledger_path,
@@ -1433,6 +1436,12 @@ fn test_optimistic_confirmation_violation() {
prev_voted_slot
);
blockstore.set_dead_slot(prev_voted_slot).unwrap();
std::fs::remove_file(Tower::get_filename(
&exited_validator_info.info.ledger_path,
&entry_point_id,
))
.unwrap();
}
cluster.restart_node(&entry_point_id, exited_validator_info);
@@ -1465,6 +1474,220 @@ fn test_optimistic_confirmation_violation() {
assert!(output.contains(&expected_log));
}
#[test]
#[serial]
#[ignore]
fn test_no_optimistic_confirmation_violation_with_tower() {
solana_logger::setup();
let mut buf = BufferRedirect::stderr().unwrap();
// First set up the cluster with 2 nodes
let slots_per_epoch = 2048;
let node_stakes = vec![51, 50];
let validator_keys: Vec<_> = iter::repeat_with(|| (Arc::new(Keypair::new()), true))
.take(node_stakes.len())
.collect();
let config = ClusterConfig {
cluster_lamports: 100_000,
node_stakes: node_stakes.clone(),
validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
validator_keys: Some(validator_keys),
slots_per_epoch,
stakers_slot_offset: slots_per_epoch,
skip_warmup_slots: true,
..ClusterConfig::default()
};
let mut cluster = LocalCluster::new(&config);
let entry_point_id = cluster.entry_point_info.id;
// Let the nodes run for a while. Wait for validators to vote on slot `S`
// so that the vote on `S-1` is definitely in gossip and optimistic confirmation is
// detected on slot `S-1` for sure, then stop the heavier of the two
// validators
let client = cluster.get_validator_client(&entry_point_id).unwrap();
let mut prev_voted_slot = 0;
loop {
let last_voted_slot = client
.get_slot_with_commitment(CommitmentConfig::recent())
.unwrap();
if last_voted_slot > 50 {
if prev_voted_slot == 0 {
prev_voted_slot = last_voted_slot;
} else {
break;
}
}
sleep(Duration::from_millis(100));
}
let exited_validator_info = cluster.exit_node(&entry_point_id);
// Mark fork as dead on the heavier validator; this should make the fork effectively
// dead, even though it was optimistically confirmed. The smaller validator should
// create and jump over to a new fork
{
let blockstore = Blockstore::open_with_access_type(
&exited_validator_info.info.ledger_path,
AccessType::PrimaryOnly,
None,
)
.unwrap_or_else(|e| {
panic!(
"Failed to open ledger at {:?}, err: {}",
exited_validator_info.info.ledger_path, e
);
});
info!(
"Setting slot: {} on main fork as dead, should cause fork",
prev_voted_slot
);
blockstore.set_dead_slot(prev_voted_slot).unwrap();
}
cluster.restart_node(&entry_point_id, exited_validator_info);
cluster.check_no_new_roots(400, "test_no_optimistic_confirmation_violation_with_tower");
// Check to see that the validator didn't detect that optimistic confirmation for
// `prev_voted_slot` failed
let expected_log = format!("Optimistic slot {} was not rooted", prev_voted_slot);
let mut output = String::new();
buf.read_to_string(&mut output).unwrap();
assert!(!output.contains(&expected_log));
}
#[test]
#[serial]
fn test_validator_saves_tower() {
solana_logger::setup();
let validator_config = ValidatorConfig {
require_tower: true,
..ValidatorConfig::default()
};
let validator_identity_keypair = Arc::new(Keypair::new());
let validator_id = validator_identity_keypair.pubkey();
let config = ClusterConfig {
cluster_lamports: 10_000,
node_stakes: vec![100],
validator_configs: vec![validator_config],
validator_keys: Some(vec![(validator_identity_keypair.clone(), true)]),
..ClusterConfig::default()
};
let mut cluster = LocalCluster::new(&config);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
let ledger_path = cluster
.validators
.get(&validator_id)
.unwrap()
.info
.ledger_path
.clone();
// Wait for some votes to be generated
let mut last_replayed_root;
loop {
if let Ok(slot) = validator_client.get_slot_with_commitment(CommitmentConfig::recent()) {
trace!("current slot: {}", slot);
if slot > 2 {
// this will be the root next time a validator starts
last_replayed_root = slot;
break;
}
}
sleep(Duration::from_millis(10));
}
// Stop validator and check saved tower
let validator_info = cluster.exit_node(&validator_id);
let tower1 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower1: {:?}", tower1);
assert_eq!(tower1.root(), Some(0));
// Restart the validator and wait for a new root
cluster.restart_node(&validator_id, validator_info);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
// Wait for the first root
loop {
if let Ok(root) = validator_client.get_slot_with_commitment(CommitmentConfig::root()) {
trace!("current root: {}", root);
if root > last_replayed_root + 1 {
last_replayed_root = root;
break;
}
}
sleep(Duration::from_millis(50));
}
// Stop validator and check saved tower
let recent_slot = validator_client
.get_slot_with_commitment(CommitmentConfig::recent())
.unwrap();
let validator_info = cluster.exit_node(&validator_id);
let tower2 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower2: {:?}", tower2);
assert_eq!(tower2.root(), Some(last_replayed_root));
last_replayed_root = recent_slot;
// Roll back the saved tower to `tower1` to simulate a validator starting from a newer snapshot
// without having to wait for that snapshot to be generated in this test
tower1.save(&validator_identity_keypair).unwrap();
cluster.restart_node(&validator_id, validator_info);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
// Wait for a new root, demonstrating the validator was able to make progress from the older `tower1`
loop {
if let Ok(root) = validator_client.get_slot_with_commitment(CommitmentConfig::root()) {
trace!(
"current root: {}, last_replayed_root: {}",
root,
last_replayed_root
);
if root > last_replayed_root {
break;
}
}
sleep(Duration::from_millis(50));
}
// Check the new root is reflected in the saved tower state
let mut validator_info = cluster.exit_node(&validator_id);
let tower3 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower3: {:?}", tower3);
assert!(tower3.root().unwrap() > last_replayed_root);
// Remove the tower file entirely and allow the validator to start without a tower. It will
// rebuild tower from its vote account contents
fs::remove_file(Tower::get_filename(&ledger_path, &validator_id)).unwrap();
validator_info.config.require_tower = false;
cluster.restart_node(&validator_id, validator_info);
let validator_client = cluster.get_validator_client(&validator_id).unwrap();
// Wait for a couple more slots to pass so another vote occurs
let current_slot = validator_client
.get_slot_with_commitment(CommitmentConfig::recent())
.unwrap();
loop {
if let Ok(slot) = validator_client.get_slot_with_commitment(CommitmentConfig::recent()) {
trace!("current_slot: {}, slot: {}", current_slot, slot);
if slot > current_slot + 1 {
break;
}
}
sleep(Duration::from_millis(50));
}
cluster.close_preserve_ledgers();
let tower4 = Tower::restore(&ledger_path, &validator_id).unwrap();
trace!("tower4: {:?}", tower4);
// Should tower4's root advance by exactly 1 slot compared to tower3?
assert_eq!(tower4.root(), tower3.root().map(|s| s + 1));
}
fn wait_for_next_snapshot(
cluster: &LocalCluster,
snapshot_package_output_path: &Path,