prioritize slot repairs for unknown last index and close to completion (#21070)
This commit is contained in:
@ -95,6 +95,8 @@ pub struct RepairTiming {
|
||||
pub add_votes_elapsed: u64,
|
||||
pub get_best_orphans_elapsed: u64,
|
||||
pub get_best_shreds_elapsed: u64,
|
||||
pub get_unknown_last_index_elapsed: u64,
|
||||
pub get_closest_completion_elapsed: u64,
|
||||
pub send_repairs_elapsed: u64,
|
||||
pub build_repairs_batch_elapsed: u64,
|
||||
pub batch_send_repairs_elapsed: u64,
|
||||
@ -118,11 +120,50 @@ impl RepairTiming {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
pub struct BestRepairsStats {
|
||||
pub call_count: u64,
|
||||
pub num_orphan_slots: u64,
|
||||
pub num_orphan_repairs: u64,
|
||||
pub num_best_shreds_slots: u64,
|
||||
pub num_best_shreds_repairs: u64,
|
||||
pub num_unknown_last_index_slots: u64,
|
||||
pub num_unknown_last_index_repairs: u64,
|
||||
pub num_closest_completion_slots: u64,
|
||||
pub num_closest_completion_repairs: u64,
|
||||
}
|
||||
|
||||
impl BestRepairsStats {
|
||||
pub fn update(
|
||||
&mut self,
|
||||
num_orphan_slots: u64,
|
||||
num_orphan_repairs: u64,
|
||||
num_best_shreds_slots: u64,
|
||||
num_best_shreds_repairs: u64,
|
||||
num_unknown_last_index_slots: u64,
|
||||
num_unknown_last_index_repairs: u64,
|
||||
num_closest_completion_slots: u64,
|
||||
num_closest_completion_repairs: u64,
|
||||
) {
|
||||
self.call_count += 1;
|
||||
self.num_orphan_slots += num_orphan_slots;
|
||||
self.num_orphan_repairs += num_orphan_repairs;
|
||||
self.num_best_shreds_slots += num_best_shreds_slots;
|
||||
self.num_best_shreds_repairs += num_best_shreds_repairs;
|
||||
self.num_unknown_last_index_slots += num_unknown_last_index_slots;
|
||||
self.num_unknown_last_index_repairs += num_unknown_last_index_repairs;
|
||||
self.num_closest_completion_slots += num_closest_completion_slots;
|
||||
self.num_closest_completion_repairs += num_closest_completion_repairs;
|
||||
}
|
||||
}
|
||||
|
||||
pub const MAX_REPAIR_LENGTH: usize = 512;
|
||||
pub const MAX_REPAIR_PER_DUPLICATE: usize = 20;
|
||||
pub const MAX_DUPLICATE_WAIT_MS: usize = 10_000;
|
||||
pub const REPAIR_MS: u64 = 100;
|
||||
pub const MAX_ORPHANS: usize = 5;
|
||||
pub const MAX_UNKNOWN_LAST_INDEX_REPAIRS: usize = 10;
|
||||
pub const MAX_CLOSEST_COMPLETION_REPAIRS: usize = 100;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RepairInfo {
|
||||
@ -210,6 +251,7 @@ impl RepairService {
|
||||
let id = repair_info.cluster_info.id();
|
||||
let mut repair_stats = RepairStats::default();
|
||||
let mut repair_timing = RepairTiming::default();
|
||||
let mut best_repairs_stats = BestRepairsStats::default();
|
||||
let mut last_stats = Instant::now();
|
||||
let duplicate_slot_repair_statuses: HashMap<Slot, DuplicateSlotRepairStatus> =
|
||||
HashMap::new();
|
||||
@ -257,15 +299,20 @@ impl RepairService {
|
||||
);
|
||||
add_votes_elapsed.stop();
|
||||
|
||||
repair_weight.get_best_weighted_repairs(
|
||||
let repairs = repair_weight.get_best_weighted_repairs(
|
||||
blockstore,
|
||||
root_bank.epoch_stakes_map(),
|
||||
root_bank.epoch_schedule(),
|
||||
MAX_ORPHANS,
|
||||
MAX_REPAIR_LENGTH,
|
||||
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
|
||||
MAX_CLOSEST_COMPLETION_REPAIRS,
|
||||
&duplicate_slot_repair_statuses,
|
||||
Some(&mut repair_timing),
|
||||
)
|
||||
Some(&mut best_repairs_stats),
|
||||
);
|
||||
|
||||
repairs
|
||||
};
|
||||
|
||||
let mut build_repairs_batch_elapsed = Measure::start("build_repairs_batch_elapsed");
|
||||
@ -362,6 +409,16 @@ impl RepairService {
|
||||
repair_timing.get_best_shreds_elapsed,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"get-unknown-last-index-elapsed",
|
||||
repair_timing.get_unknown_last_index_elapsed,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"get-closest-completion-elapsed",
|
||||
repair_timing.get_closest_completion_elapsed,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"send-repairs-elapsed",
|
||||
repair_timing.send_repairs_elapsed,
|
||||
@ -378,8 +435,45 @@ impl RepairService {
|
||||
i64
|
||||
),
|
||||
);
|
||||
datapoint_info!(
|
||||
"serve_repair-best-repairs",
|
||||
("call-count", best_repairs_stats.call_count, i64),
|
||||
("orphan-slots", best_repairs_stats.num_orphan_slots, i64),
|
||||
("orphan-repairs", best_repairs_stats.num_orphan_repairs, i64),
|
||||
(
|
||||
"best-shreds-slots",
|
||||
best_repairs_stats.num_best_shreds_slots,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"best-shreds-repairs",
|
||||
best_repairs_stats.num_best_shreds_repairs,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"unknown-last-index-slots",
|
||||
best_repairs_stats.num_unknown_last_index_slots,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"unknown-last-index-repairs",
|
||||
best_repairs_stats.num_unknown_last_index_repairs,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"closest-completion-slots",
|
||||
best_repairs_stats.num_closest_completion_slots,
|
||||
i64
|
||||
),
|
||||
(
|
||||
"closest-completion-repairs",
|
||||
best_repairs_stats.num_closest_completion_repairs,
|
||||
i64
|
||||
),
|
||||
);
|
||||
repair_stats = RepairStats::default();
|
||||
repair_timing = RepairTiming::default();
|
||||
best_repairs_stats = BestRepairsStats::default();
|
||||
last_stats = Instant::now();
|
||||
}
|
||||
sleep(Duration::from_millis(REPAIR_MS));
|
||||
@ -474,7 +568,7 @@ impl RepairService {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[cfg_attr(not(test), allow(dead_code))]
|
||||
fn generate_duplicate_repairs_for_slot(
|
||||
blockstore: &Blockstore,
|
||||
slot: Slot,
|
||||
@ -499,7 +593,7 @@ impl RepairService {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[cfg_attr(not(test), allow(dead_code))]
|
||||
fn generate_and_send_duplicate_repairs(
|
||||
duplicate_slot_repair_statuses: &mut HashMap<Slot, DuplicateSlotRepairStatus>,
|
||||
cluster_slots: &ClusterSlots,
|
||||
@ -550,7 +644,7 @@ impl RepairService {
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[cfg_attr(not(test), allow(dead_code))]
|
||||
fn serialize_and_send_request(
|
||||
repair_type: &ShredRepairType,
|
||||
repair_socket: &UdpSocket,
|
||||
@ -566,7 +660,7 @@ impl RepairService {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[cfg_attr(not(test), allow(dead_code))]
|
||||
fn update_duplicate_slot_repair_addr(
|
||||
slot: Slot,
|
||||
status: &mut DuplicateSlotRepairStatus,
|
||||
@ -659,8 +753,11 @@ mod test {
|
||||
&EpochSchedule::default(),
|
||||
MAX_ORPHANS,
|
||||
MAX_REPAIR_LENGTH,
|
||||
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
|
||||
MAX_CLOSEST_COMPLETION_REPAIRS,
|
||||
&HashSet::default(),
|
||||
None,
|
||||
None,
|
||||
),
|
||||
vec![
|
||||
ShredRepairType::Orphan(2),
|
||||
@ -693,8 +790,11 @@ mod test {
|
||||
&EpochSchedule::default(),
|
||||
MAX_ORPHANS,
|
||||
MAX_REPAIR_LENGTH,
|
||||
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
|
||||
MAX_CLOSEST_COMPLETION_REPAIRS,
|
||||
&HashSet::default(),
|
||||
None
|
||||
None,
|
||||
None,
|
||||
),
|
||||
vec![ShredRepairType::HighestShred(0, 0)]
|
||||
);
|
||||
@ -748,8 +848,11 @@ mod test {
|
||||
&EpochSchedule::default(),
|
||||
MAX_ORPHANS,
|
||||
MAX_REPAIR_LENGTH,
|
||||
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
|
||||
MAX_CLOSEST_COMPLETION_REPAIRS,
|
||||
&HashSet::default(),
|
||||
None
|
||||
None,
|
||||
None,
|
||||
),
|
||||
expected
|
||||
);
|
||||
@ -761,8 +864,11 @@ mod test {
|
||||
&EpochSchedule::default(),
|
||||
MAX_ORPHANS,
|
||||
expected.len() - 2,
|
||||
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
|
||||
MAX_CLOSEST_COMPLETION_REPAIRS,
|
||||
&HashSet::default(),
|
||||
None
|
||||
None,
|
||||
None,
|
||||
)[..],
|
||||
expected[0..expected.len() - 2]
|
||||
);
|
||||
@ -799,8 +905,11 @@ mod test {
|
||||
&EpochSchedule::default(),
|
||||
MAX_ORPHANS,
|
||||
MAX_REPAIR_LENGTH,
|
||||
MAX_UNKNOWN_LAST_INDEX_REPAIRS,
|
||||
MAX_CLOSEST_COMPLETION_REPAIRS,
|
||||
&HashSet::default(),
|
||||
None
|
||||
None,
|
||||
None,
|
||||
),
|
||||
expected
|
||||
);
|
||||
|
Reference in New Issue
Block a user