leader_scheduler: remove bootstrap_height

This commit is contained in:
Michael Vines
2019-02-05 08:03:52 -08:00
parent 73979d8f5a
commit c5a74ada05
27 changed files with 1591 additions and 1789 deletions

View File

@@ -31,7 +31,6 @@ pub fn repair(
leader_scheduler_option: &Arc<RwLock<LeaderScheduler>>,
) -> Result<Vec<(SocketAddr, Vec<u8>)>> {
let rcluster_info = cluster_info.read().unwrap();
let mut is_next_leader = false;
let meta = db_ledger.meta()?;
if meta.is_none() {
return Ok(vec![]);
@@ -43,35 +42,33 @@ pub fn repair(
// Repair should only be called when received > consumed, enforced in window_service
assert!(received > consumed);
{
let ls_lock = leader_scheduler_option.read().unwrap();
if !ls_lock.use_only_bootstrap_leader {
// Calculate the next leader rotation height and check if we are the leader
if let Some(next_leader_rotation_height) = ls_lock.max_height_for_leader(tick_height) {
match ls_lock.get_scheduled_leader(next_leader_rotation_height) {
Some((leader_id, _)) if leader_id == *id => is_next_leader = true,
// In the case that we are not in the current scope of the leader schedule
// window then either:
//
// 1) The replay stage hasn't caught up to the "consumed" entries we sent,
// in which case it will eventually catch up
//
// 2) We are on the border between seed_rotation_intervals, so the
// schedule won't be known until the entry on that cusp is received
// by the replay stage (which comes after this stage). Hence, the next
// leader at the beginning of that next epoch will not know they are the
// leader until they receive that last "cusp" entry. The leader also won't ask for repairs
// for that entry because "is_next_leader" won't be set here. In this case,
// everybody will be blocking waiting for that "cusp" entry instead of repairing,
// until the leader hits "times" >= the max times in calculate_max_repair_entry_height().
// The impact of this, along with the similar problem from broadcast for the transitioning
// leader, can be observed in the multinode test, test_full_leader_validator_network(),
None => (),
_ => (),
}
}
// Check if we are the leader of the next slot
let is_next_leader = {
let leader_scheduler = leader_scheduler_option.read().unwrap();
let next_slot = leader_scheduler.tick_height_to_slot(tick_height) + 1;
match leader_scheduler.get_leader_for_slot(next_slot) {
Some(leader_id) if leader_id == *id => true,
// In the case that we are not in the current scope of the leader schedule
// window then either:
//
// 1) The replay stage hasn't caught up to the "consumed" entries we sent,
// in which case it will eventually catch up
//
// 2) We are on the border between seed_rotation_intervals, so the
// schedule won't be known until the entry on that cusp is received
// by the replay stage (which comes after this stage). Hence, the next
// leader at the beginning of that next epoch will not know they are the
// leader until they receive that last "cusp" entry. The leader also won't ask for repairs
// for that entry because "is_next_leader" won't be set here. In this case,
// everybody will be blocking waiting for that "cusp" entry instead of repairing,
// until the leader hits "times" >= the max times in calculate_max_repair_entry_height().
// The impact of this, along with the similar problem from broadcast for the transitioning
// leader, can be observed in the multinode test, test_full_leader_validator_network().
None => false,
_ => false,
}
}
};
let num_peers = rcluster_info.repair_peers().len() as u64;
@@ -195,7 +192,8 @@ pub fn process_blob(
// TODO: Once the original leader signature is added to the blob, make sure that
// the blob was originally generated by the expected leader for this slot
if leader.is_none() {
return Ok(());
warn!("No leader for slot {}, blob dropped", slot);
return Ok(()); // Occurs as a leader is rotating into a validator
}
// Insert the new blob into the window
@@ -393,8 +391,9 @@ mod test {
pub fn test_retransmit() {
let leader = Keypair::new().pubkey();
let nonleader = Keypair::new().pubkey();
let leader_scheduler =
Arc::new(RwLock::new(LeaderScheduler::from_bootstrap_leader(leader)));
let mut leader_scheduler = LeaderScheduler::default();
leader_scheduler.set_leader_schedule(vec![leader]);
let leader_scheduler = Arc::new(RwLock::new(leader_scheduler));
let blob = SharedBlob::default();
let (blob_sender, blob_receiver) = channel();
@@ -655,17 +654,12 @@ mod test {
#[test]
fn test_process_blob() {
// Create the leader scheduler
let leader_keypair = Keypair::new();
let mut leader_scheduler = LeaderScheduler::from_bootstrap_leader(leader_keypair.pubkey());
let mut leader_scheduler = LeaderScheduler::default();
leader_scheduler.set_leader_schedule(vec![Keypair::new().pubkey()]);
let db_ledger_path = get_tmp_ledger_path("test_process_blob");
let db_ledger = Arc::new(DbLedger::open(&db_ledger_path).unwrap());
// Mock the tick height to look like the tick height right after a leader transition
leader_scheduler.last_seed_height = None;
leader_scheduler.use_only_bootstrap_leader = false;
let leader_scheduler = Arc::new(RwLock::new(leader_scheduler));
let num_entries = 10;
let original_entries = make_tiny_test_entries(num_entries);