Prioritize repairs for slots with an unknown last index and for slots close to completion (#21070)

This commit is contained in:
Jeff Biseda
2021-11-19 19:17:30 -08:00
committed by GitHub
parent ebea3297f9
commit 2ed7e3af89
6 changed files with 600 additions and 18 deletions

View File

@ -95,6 +95,8 @@ pub struct RepairTiming {
pub add_votes_elapsed: u64,
pub get_best_orphans_elapsed: u64,
pub get_best_shreds_elapsed: u64,
pub get_unknown_last_index_elapsed: u64,
pub get_closest_completion_elapsed: u64,
pub send_repairs_elapsed: u64,
pub build_repairs_batch_elapsed: u64,
pub batch_send_repairs_elapsed: u64,
@ -118,11 +120,50 @@ impl RepairTiming {
}
}
/// Accumulated counters for repair selection, split by category
/// (orphan, best-shreds, unknown-last-index, closest-completion).
///
/// The counters only grow through [`BestRepairsStats::update`]. The repair
/// service reports them in the `serve_repair-best-repairs` datapoint and then
/// starts over from `BestRepairsStats::default()`.
#[derive(Default, Debug)]
pub struct BestRepairsStats {
    /// Number of `update` calls folded into the totals; reported as `call-count`.
    pub call_count: u64,
    /// Reported as `orphan-slots`.
    pub num_orphan_slots: u64,
    /// Reported as `orphan-repairs`.
    pub num_orphan_repairs: u64,
    /// Reported as `best-shreds-slots`.
    pub num_best_shreds_slots: u64,
    /// Reported as `best-shreds-repairs`.
    pub num_best_shreds_repairs: u64,
    /// Reported as `unknown-last-index-slots`.
    pub num_unknown_last_index_slots: u64,
    /// Reported as `unknown-last-index-repairs`.
    pub num_unknown_last_index_repairs: u64,
    /// Reported as `closest-completion-slots`.
    pub num_closest_completion_slots: u64,
    /// Reported as `closest-completion-repairs`.
    pub num_closest_completion_repairs: u64,
}

impl BestRepairsStats {
    /// Folds one set of per-category counts into the running totals.
    ///
    /// `call_count` is bumped by one, and every argument is added to the
    /// field that carries the same name.
    // One argument per counter keeps call sites explicit. That is more
    // arguments than clippy's `too_many_arguments` lint accepts by default,
    // so the lint is silenced for this function only.
    #[allow(clippy::too_many_arguments)]
    pub fn update(
        &mut self,
        num_orphan_slots: u64,
        num_orphan_repairs: u64,
        num_best_shreds_slots: u64,
        num_best_shreds_repairs: u64,
        num_unknown_last_index_slots: u64,
        num_unknown_last_index_repairs: u64,
        num_closest_completion_slots: u64,
        num_closest_completion_repairs: u64,
    ) {
        self.call_count += 1;
        self.num_orphan_slots += num_orphan_slots;
        self.num_orphan_repairs += num_orphan_repairs;
        self.num_best_shreds_slots += num_best_shreds_slots;
        self.num_best_shreds_repairs += num_best_shreds_repairs;
        self.num_unknown_last_index_slots += num_unknown_last_index_slots;
        self.num_unknown_last_index_repairs += num_unknown_last_index_repairs;
        self.num_closest_completion_slots += num_closest_completion_slots;
        self.num_closest_completion_repairs += num_closest_completion_repairs;
    }
}
/// Upper bound on how many repairs `get_best_weighted_repairs` returns; the
/// repair service passes it as that call's repair-length argument.
pub const MAX_REPAIR_LENGTH: usize = 512;
/// NOTE(review): no use is visible in this view; presumably caps the repair
/// requests generated for a single duplicate slot. Confirm against callers.
pub const MAX_REPAIR_PER_DUPLICATE: usize = 20;
/// NOTE(review): no use is visible in this view; the name suggests a wait
/// limit in milliseconds for duplicate-slot repair. Confirm against callers.
pub const MAX_DUPLICATE_WAIT_MS: usize = 10_000;
/// Milliseconds the repair service sleeps via
/// `sleep(Duration::from_millis(REPAIR_MS))`.
pub const REPAIR_MS: u64 = 100;
/// Passed to `get_best_weighted_repairs`; presumably limits how many orphan
/// repairs one call may produce. Confirm in `repair_weight`.
pub const MAX_ORPHANS: usize = 5;
/// Passed to `get_best_weighted_repairs`; presumably limits repairs for slots
/// whose last shred index is not yet known. Confirm in `repair_weight`.
pub const MAX_UNKNOWN_LAST_INDEX_REPAIRS: usize = 10;
/// Passed to `get_best_weighted_repairs`; presumably limits repairs for the
/// slots closest to completion. Confirm in `repair_weight`.
pub const MAX_CLOSEST_COMPLETION_REPAIRS: usize = 100;
#[derive(Clone)]
pub struct RepairInfo {
@ -210,6 +251,7 @@ impl RepairService {
let id = repair_info.cluster_info.id();
let mut repair_stats = RepairStats::default();
let mut repair_timing = RepairTiming::default();
let mut best_repairs_stats = BestRepairsStats::default();
let mut last_stats = Instant::now();
let duplicate_slot_repair_statuses: HashMap<Slot, DuplicateSlotRepairStatus> =
HashMap::new();
@ -257,15 +299,20 @@ impl RepairService {
);
add_votes_elapsed.stop();
repair_weight.get_best_weighted_repairs(
let repairs = repair_weight.get_best_weighted_repairs(
blockstore,
root_bank.epoch_stakes_map(),
root_bank.epoch_schedule(),
MAX_ORPHANS,
MAX_REPAIR_LENGTH,
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
MAX_CLOSEST_COMPLETION_REPAIRS,
&duplicate_slot_repair_statuses,
Some(&mut repair_timing),
)
Some(&mut best_repairs_stats),
);
repairs
};
let mut build_repairs_batch_elapsed = Measure::start("build_repairs_batch_elapsed");
@ -362,6 +409,16 @@ impl RepairService {
repair_timing.get_best_shreds_elapsed,
i64
),
(
"get-unknown-last-index-elapsed",
repair_timing.get_unknown_last_index_elapsed,
i64
),
(
"get-closest-completion-elapsed",
repair_timing.get_closest_completion_elapsed,
i64
),
(
"send-repairs-elapsed",
repair_timing.send_repairs_elapsed,
@ -378,8 +435,45 @@ impl RepairService {
i64
),
);
datapoint_info!(
"serve_repair-best-repairs",
("call-count", best_repairs_stats.call_count, i64),
("orphan-slots", best_repairs_stats.num_orphan_slots, i64),
("orphan-repairs", best_repairs_stats.num_orphan_repairs, i64),
(
"best-shreds-slots",
best_repairs_stats.num_best_shreds_slots,
i64
),
(
"best-shreds-repairs",
best_repairs_stats.num_best_shreds_repairs,
i64
),
(
"unknown-last-index-slots",
best_repairs_stats.num_unknown_last_index_slots,
i64
),
(
"unknown-last-index-repairs",
best_repairs_stats.num_unknown_last_index_repairs,
i64
),
(
"closest-completion-slots",
best_repairs_stats.num_closest_completion_slots,
i64
),
(
"closest-completion-repairs",
best_repairs_stats.num_closest_completion_repairs,
i64
),
);
repair_stats = RepairStats::default();
repair_timing = RepairTiming::default();
best_repairs_stats = BestRepairsStats::default();
last_stats = Instant::now();
}
sleep(Duration::from_millis(REPAIR_MS));
@ -474,7 +568,7 @@ impl RepairService {
}
}
#[allow(dead_code)]
#[cfg_attr(not(test), allow(dead_code))]
fn generate_duplicate_repairs_for_slot(
blockstore: &Blockstore,
slot: Slot,
@ -499,7 +593,7 @@ impl RepairService {
}
}
#[allow(dead_code)]
#[cfg_attr(not(test), allow(dead_code))]
fn generate_and_send_duplicate_repairs(
duplicate_slot_repair_statuses: &mut HashMap<Slot, DuplicateSlotRepairStatus>,
cluster_slots: &ClusterSlots,
@ -550,7 +644,7 @@ impl RepairService {
})
}
#[allow(dead_code)]
#[cfg_attr(not(test), allow(dead_code))]
fn serialize_and_send_request(
repair_type: &ShredRepairType,
repair_socket: &UdpSocket,
@ -566,7 +660,7 @@ impl RepairService {
Ok(())
}
#[allow(dead_code)]
#[cfg_attr(not(test), allow(dead_code))]
fn update_duplicate_slot_repair_addr(
slot: Slot,
status: &mut DuplicateSlotRepairStatus,
@ -659,8 +753,11 @@ mod test {
&EpochSchedule::default(),
MAX_ORPHANS,
MAX_REPAIR_LENGTH,
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
MAX_CLOSEST_COMPLETION_REPAIRS,
&HashSet::default(),
None,
None,
),
vec![
ShredRepairType::Orphan(2),
@ -693,8 +790,11 @@ mod test {
&EpochSchedule::default(),
MAX_ORPHANS,
MAX_REPAIR_LENGTH,
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
MAX_CLOSEST_COMPLETION_REPAIRS,
&HashSet::default(),
None
None,
None,
),
vec![ShredRepairType::HighestShred(0, 0)]
);
@ -748,8 +848,11 @@ mod test {
&EpochSchedule::default(),
MAX_ORPHANS,
MAX_REPAIR_LENGTH,
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
MAX_CLOSEST_COMPLETION_REPAIRS,
&HashSet::default(),
None
None,
None,
),
expected
);
@ -761,8 +864,11 @@ mod test {
&EpochSchedule::default(),
MAX_ORPHANS,
expected.len() - 2,
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
MAX_CLOSEST_COMPLETION_REPAIRS,
&HashSet::default(),
None
None,
None,
)[..],
expected[0..expected.len() - 2]
);
@ -799,8 +905,11 @@ mod test {
&EpochSchedule::default(),
MAX_ORPHANS,
MAX_REPAIR_LENGTH,
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
MAX_CLOSEST_COMPLETION_REPAIRS,
&HashSet::default(),
None
None,
None,
),
expected
);