Leader scheduler plumbing (#1440)

* Added LeaderScheduler module and tests

* Plumbing for LeaderScheduler in Fullnode + tests. Add vote processing for the active set to ReplicateStage and WriteStage

* Add LeaderScheduler plumbing for Tvu, window, and tests

* Fix bank and switch tests to use new LeaderScheduler

* Move leader rotation check from window service to replicate stage

* Add replicate_stage leader rotation exit test

* Removed the leader scheduler from the window service and associated modules/tests

* Corrected is_leader calculation in repair() function in window.rs

* Integrate LeaderScheduler with write_stage for leader to validator transitions

* Integrated LeaderScheduler with BroadcastStage

* Removed gossip leader rotation from crdt

* Add multi validator, leader test

* Comments and cleanup

* Remove unneeded checks from broadcast stage

* Fix the case where a validator/leader needs to immediately transition on startup after reading the ledger and seeing it is not in the correct role

* Set new leader in validator -> validator transitions

* Clean up for PR comments, refactor LeaderScheduler from process_entry/process_ledger_tail

* Cleaned out LeaderScheduler options; implemented a LeaderScheduler strategy that only picks the bootstrap leader, to support existing tests and drone/airdrops (see the sketch below)

* Ignore the test_full_leader_validator_network test due to a bug where the next leader in line fails to get the last entry before rotation (because it hasn't started up yet). Added a test, test_dropped_handoff_recovery, to track this bug
Author: carllin
Date: 2018-10-10 16:49:41 -07:00
Committed by: GitHub
Parent: 2ba2bc72ca
Commit: 9931ac9780
22 changed files with 1743 additions and 898 deletions
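
The bootstrap-leader-only strategy mentioned in the commit message can be pictured as a scheduler that answers every query with the configured bootstrap leader. The sketch below is illustrative only, not code from this commit: the field use_only_bootstrap_leader and the methods get_scheduled_leader and entries_until_next_leader_rotation appear in the diff further down, while the Pubkey stand-in, the bootstrap_leader field, and the leader_rotation_interval field are assumptions made so the example is self-contained.

    // Sketch only: `Pubkey` stands in for the node identity type used in window.rs.
    #[derive(Clone, Copy, PartialEq, Debug)]
    pub struct Pubkey([u8; 32]);

    pub struct LeaderScheduler {
        // When true, every height maps to the bootstrap leader (the mode used by
        // the existing tests and drone/airdrop flows, per the commit message).
        pub use_only_bootstrap_leader: bool,
        // Assumed fields for this sketch:
        bootstrap_leader: Pubkey,
        leader_rotation_interval: u64,
    }

    impl LeaderScheduler {
        // Who is scheduled to lead at `entry_height`? In bootstrap-only mode the
        // answer is always the bootstrap leader.
        pub fn get_scheduled_leader(&self, _entry_height: u64) -> Option<Pubkey> {
            if self.use_only_bootstrap_leader {
                return Some(self.bootstrap_leader);
            }
            // A full implementation would consult the computed schedule here.
            None
        }

        // Entries remaining until the next rotation boundary after `entry_height`;
        // None in bootstrap-only mode, where no rotation ever happens.
        pub fn entries_until_next_leader_rotation(&self, entry_height: u64) -> Option<u64> {
            if self.use_only_bootstrap_leader {
                return None;
            }
            let interval = self.leader_rotation_interval;
            Some(interval - (entry_height % interval))
        }
    }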


@@ -5,6 +5,7 @@ use counter::Counter;
use entry::Entry;
#[cfg(feature = "erasure")]
use erasure;
use leader_scheduler::LeaderScheduler;
use ledger::{reconstruct_entries_from_blobs, Block};
use log::Level;
use packet::SharedBlob;
@@ -51,6 +52,7 @@ pub trait WindowUtil {
    /// Finds available slots, clears them, and returns their indices.
    fn clear_slots(&mut self, consumed: u64, received: u64) -> Vec<u64>;
    #[cfg_attr(feature = "cargo-clippy", allow(too_many_arguments))]
    fn repair(
        &mut self,
        cluster_info: &Arc<RwLock<ClusterInfo>>,
@@ -59,6 +61,7 @@ pub trait WindowUtil {
        consumed: u64,
        received: u64,
        max_entry_height: u64,
        leader_scheduler_option: &Arc<RwLock<LeaderScheduler>>,
    ) -> Vec<(SocketAddr, Vec<u8>)>;
    fn print(&self, id: &Pubkey, consumed: u64) -> String;
@@ -67,14 +70,12 @@ pub trait WindowUtil {
    fn process_blob(
        &mut self,
        id: &Pubkey,
        cluster_info: &Arc<RwLock<ClusterInfo>>,
        blob: SharedBlob,
        pix: u64,
        consume_queue: &mut Vec<Entry>,
        consumed: &mut u64,
        leader_unknown: bool,
        pending_retransmits: &mut bool,
        leader_rotation_interval: u64,
    );
}
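
The signature changes above are the heart of the plumbing: repair() now receives a shared &Arc<RwLock<LeaderScheduler>> handle, while process_blob() no longer needs ClusterInfo or a rotation interval. A minimal sketch of how such a handle might be shared between the stages named in the commit message, assuming the LeaderScheduler sketch above (the function and variable names here are illustrative, not code from this commit):

    use std::sync::{Arc, RwLock};

    fn plumb_leader_scheduler(scheduler: LeaderScheduler) {
        // One scheduler, wrapped so every stage can read it concurrently.
        let leader_scheduler = Arc::new(RwLock::new(scheduler));

        // Clones of the Arc are handed to each consumer; they all observe the
        // same scheduler state.
        let for_window_service = leader_scheduler.clone();
        let for_replicate_stage = leader_scheduler.clone();

        // A consumer holds the read lock only for the duration of a query,
        // mirroring the pattern in repair() below.
        let bootstrap_only = for_window_service.read().unwrap().use_only_bootstrap_leader;
        let _ = (for_replicate_stage, bootstrap_only);
    }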
@@ -101,13 +102,40 @@ impl WindowUtil for Window {
        consumed: u64,
        received: u64,
        max_entry_height: u64,
        leader_scheduler_option: &Arc<RwLock<LeaderScheduler>>,
    ) -> Vec<(SocketAddr, Vec<u8>)> {
        let rcluster_info = cluster_info.read().unwrap();
        let leader_rotation_interval = rcluster_info.get_leader_rotation_interval();
        // Calculate the next leader rotation height and check if we are the leader
        let next_leader_rotation =
            consumed + leader_rotation_interval - (consumed % leader_rotation_interval);
        let is_next_leader = rcluster_info.get_scheduled_leader(next_leader_rotation) == Some(*id);
        let mut is_next_leader = false;
        {
            let ls_lock = leader_scheduler_option.read().unwrap();
            if !ls_lock.use_only_bootstrap_leader {
                // Calculate the next leader rotation height and check if we are the leader
                let next_leader_rotation_height = consumed
                    + ls_lock
                        .entries_until_next_leader_rotation(consumed)
                        .expect("Leader rotation should exist when not using default implementation of LeaderScheduler");
                match ls_lock.get_scheduled_leader(next_leader_rotation_height) {
                    Some(leader_id) if leader_id == *id => is_next_leader = true,
                    // In the case that we are not in the current scope of the leader schedule
                    // window then either:
                    //
                    // 1) The replicate stage hasn't caught up to the "consumed" entries we sent,
                    // in which case it will eventually catch up
                    //
                    // 2) We are on the border between seed_rotation_intervals, so the
                    // schedule won't be known until the entry on that cusp is received
                    // by the replicate stage (which comes after this stage). Hence, the next
                    // leader at the beginning of that next epoch will not know he is the
                    // leader until he receives that last "cusp" entry. He also won't ask for repairs
                    // for that entry because "is_next_leader" won't be set here. In this case,
                    // everybody will be blocking waiting for that "cusp" entry instead of repairing,
                    // until the leader hits "times" >= the max times in calculate_max_repair().
                    // The impact of this, along with the similar problem from broadcast for the transitioning
                    // leader, can be observed in the multinode test, test_full_leader_validator_network()
                    None => (),
                    _ => (),
                }
            }
        }
        let num_peers = rcluster_info.table.len() as u64;
        let max_repair = if max_entry_height == 0 {
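
The replacement call is meant to land on the same boundary the deleted inline arithmetic computed: the next multiple of the rotation interval above consumed. A small standalone check of that arithmetic (illustrative, not part of this commit; it assumes entries_until_next_leader_rotation returns interval - (consumed % interval), matching the removed expression):

    // Next leader-rotation height after `consumed` for a given rotation interval:
    // the smallest multiple of `interval` strictly greater than `consumed`.
    fn next_leader_rotation_height(consumed: u64, interval: u64) -> u64 {
        consumed + (interval - consumed % interval)
    }

    fn main() {
        // With, say, a 100-entry rotation interval:
        assert_eq!(next_leader_rotation_height(0, 100), 100);
        assert_eq!(next_leader_rotation_height(99, 100), 100);
        // On an exact boundary the next rotation is a full interval away.
        assert_eq!(next_leader_rotation_height(100, 100), 200);
        assert_eq!(next_leader_rotation_height(150, 100), 200);
        println!("rotation-height arithmetic checks out");
    }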
@@ -196,14 +224,12 @@ impl WindowUtil for Window {
    fn process_blob(
        &mut self,
        id: &Pubkey,
        cluster_info: &Arc<RwLock<ClusterInfo>>,
        blob: SharedBlob,
        pix: u64,
        consume_queue: &mut Vec<Entry>,
        consumed: &mut u64,
        leader_unknown: bool,
        pending_retransmits: &mut bool,
        leader_rotation_interval: u64,
    ) {
        let w = (pix % WINDOW_SIZE) as usize;
@@ -258,18 +284,6 @@ impl WindowUtil for Window {
        // push all contiguous blobs into consumed queue, increment consumed
        loop {
            if *consumed != 0 && *consumed % (leader_rotation_interval as u64) == 0 {
                let rcluster_info = cluster_info.read().unwrap();
                let my_id = rcluster_info.my_data().id;
                match rcluster_info.get_scheduled_leader(*consumed) {
                    // If we are the next leader, exit
                    Some(id) if id == my_id => {
                        break;
                    }
                    _ => (),
                }
            }
            let k = (*consumed % WINDOW_SIZE) as usize;
            trace!("{}: k: {} consumed: {}", id, k, *consumed,);
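
The block deleted in this hunk is the check that, per the commit message, moved from the window service into the replicate stage, where it is driven by LeaderScheduler instead of ClusterInfo. A hedged sketch of what that relocated check might look like, reusing the LeaderScheduler and Pubkey sketch types from above (should_exit_to_leader_role and its parameters are illustrative names, not the actual replicate_stage code):

    use std::sync::{Arc, RwLock};

    // Returns true when `entry_height` sits on a rotation boundary and the
    // scheduler says this node leads from that height onward.
    fn should_exit_to_leader_role(
        leader_scheduler: &Arc<RwLock<LeaderScheduler>>,
        my_id: Pubkey,
        entry_height: u64,
        leader_rotation_interval: u64,
    ) -> bool {
        // Same boundary test the deleted window-service code performed...
        if entry_height == 0 || entry_height % leader_rotation_interval != 0 {
            return false;
        }
        // ...but the scheduled leader is now looked up in LeaderScheduler.
        match leader_scheduler.read().unwrap().get_scheduled_leader(entry_height) {
            Some(leader_id) => leader_id == my_id,
            None => false,
        }
    }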