Fix repair (#3581)
Add DetachedHeads repair protocol Add DetachedHeads repair test Repair starting from root
This commit is contained in:
@ -16,28 +16,16 @@ use std::time::Duration;
|
||||
pub const MAX_REPAIR_LENGTH: usize = 16;
|
||||
pub const REPAIR_MS: u64 = 100;
|
||||
pub const MAX_REPAIR_TRIES: u64 = 128;
|
||||
pub const NUM_FORKS_TO_REPAIR: usize = 5;
|
||||
pub const MAX_ORPHANS: usize = 5;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RepairType {
|
||||
Orphan(u64),
|
||||
HighestBlob(u64, u64),
|
||||
Blob(u64, u64),
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct RepairInfo {
|
||||
max_slot: u64,
|
||||
repair_tries: u64,
|
||||
}
|
||||
|
||||
impl RepairInfo {
|
||||
fn new() -> Self {
|
||||
RepairInfo {
|
||||
max_slot: 0,
|
||||
repair_tries: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RepairSlotRange {
|
||||
pub start: u64,
|
||||
pub end: u64,
|
||||
@ -62,50 +50,36 @@ impl RepairService {
|
||||
exit: Arc<AtomicBool>,
|
||||
repair_socket: &Arc<UdpSocket>,
|
||||
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
||||
repair_slot_range: RepairSlotRange,
|
||||
_repair_slot_range: RepairSlotRange,
|
||||
) {
|
||||
let mut repair_info = RepairInfo::new();
|
||||
let id = cluster_info.read().unwrap().id();
|
||||
loop {
|
||||
if exit.load(Ordering::Relaxed) {
|
||||
break;
|
||||
}
|
||||
|
||||
let repairs = Self::generate_repairs(
|
||||
blocktree,
|
||||
MAX_REPAIR_LENGTH,
|
||||
&mut repair_info,
|
||||
&repair_slot_range,
|
||||
);
|
||||
let repairs = Self::generate_repairs(blocktree, MAX_REPAIR_LENGTH);
|
||||
|
||||
if let Ok(repairs) = repairs {
|
||||
let reqs: Vec<_> = repairs
|
||||
.into_iter()
|
||||
.filter_map(|repair_request| {
|
||||
let (slot, blob_index, is_highest_request) = {
|
||||
match repair_request {
|
||||
RepairType::Blob(s, i) => (s, i, false),
|
||||
RepairType::HighestBlob(s, i) => (s, i, true),
|
||||
}
|
||||
};
|
||||
cluster_info
|
||||
.read()
|
||||
.unwrap()
|
||||
.window_index_request(slot, blob_index, is_highest_request)
|
||||
.map(|result| (result, slot, blob_index))
|
||||
.repair_request(&repair_request)
|
||||
.map(|result| (result, repair_request))
|
||||
.ok()
|
||||
})
|
||||
.collect();
|
||||
|
||||
for ((to, req), slot, blob_index) in reqs {
|
||||
for ((to, req), repair_request) in reqs {
|
||||
if let Ok(local_addr) = repair_socket.local_addr() {
|
||||
submit(
|
||||
influxdb::Point::new("repair_service")
|
||||
.add_field("repair_slot", influxdb::Value::Integer(slot as i64))
|
||||
.to_owned()
|
||||
.add_field(
|
||||
"repair_blob",
|
||||
influxdb::Value::Integer(blob_index as i64),
|
||||
"repair_request",
|
||||
influxdb::Value::String(format!("{:?}", repair_request)),
|
||||
)
|
||||
.to_owned()
|
||||
.add_field("to", influxdb::Value::String(to.to_string()))
|
||||
@ -151,7 +125,7 @@ impl RepairService {
|
||||
RepairService { t_repair }
|
||||
}
|
||||
|
||||
fn process_slot(
|
||||
fn generate_repairs_for_slot(
|
||||
blocktree: &Blocktree,
|
||||
slot: u64,
|
||||
slot_meta: &SlotMeta,
|
||||
@ -175,49 +149,49 @@ impl RepairService {
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_repairs(
|
||||
blocktree: &Blocktree,
|
||||
max_repairs: usize,
|
||||
repair_info: &mut RepairInfo,
|
||||
repair_range: &RepairSlotRange,
|
||||
) -> Result<(Vec<RepairType>)> {
|
||||
fn generate_repairs(blocktree: &Blocktree, max_repairs: usize) -> Result<(Vec<RepairType>)> {
|
||||
// Slot height and blob indexes for blobs we want to repair
|
||||
let mut repairs: Vec<RepairType> = vec![];
|
||||
let mut current_slot = Some(repair_range.start);
|
||||
while repairs.len() < max_repairs && current_slot.is_some() {
|
||||
if current_slot.unwrap() > repair_range.end {
|
||||
break;
|
||||
}
|
||||
let slot = *blocktree.root_slot.read().unwrap();
|
||||
Self::generate_repairs_for_fork(blocktree, &mut repairs, max_repairs, slot);
|
||||
|
||||
if current_slot.unwrap() > repair_info.max_slot {
|
||||
repair_info.repair_tries = 0;
|
||||
repair_info.max_slot = current_slot.unwrap();
|
||||
}
|
||||
// TODO: Incorporate gossip to determine priorities for repair?
|
||||
|
||||
if let Some(slot) = blocktree.meta(current_slot.unwrap())? {
|
||||
let new_repairs = Self::process_slot(
|
||||
// Try to resolve orphans in blocktree
|
||||
let orphans = blocktree.get_orphans(Some(MAX_ORPHANS));
|
||||
|
||||
Self::generate_repairs_for_orphans(&orphans[..], &mut repairs);
|
||||
Ok(repairs)
|
||||
}
|
||||
|
||||
fn generate_repairs_for_orphans(orphans: &[u64], repairs: &mut Vec<RepairType>) {
|
||||
repairs.extend(orphans.iter().map(|h| RepairType::Orphan(*h)));
|
||||
}
|
||||
|
||||
/// Repairs any fork starting at the input slot
|
||||
fn generate_repairs_for_fork(
|
||||
blocktree: &Blocktree,
|
||||
repairs: &mut Vec<RepairType>,
|
||||
max_repairs: usize,
|
||||
slot: u64,
|
||||
) {
|
||||
let mut pending_slots = vec![slot];
|
||||
while repairs.len() < max_repairs && !pending_slots.is_empty() {
|
||||
let slot = pending_slots.pop().unwrap();
|
||||
if let Some(slot_meta) = blocktree.meta(slot).unwrap() {
|
||||
let new_repairs = Self::generate_repairs_for_slot(
|
||||
blocktree,
|
||||
current_slot.unwrap(),
|
||||
&slot,
|
||||
slot,
|
||||
&slot_meta,
|
||||
max_repairs - repairs.len(),
|
||||
);
|
||||
repairs.extend(new_repairs);
|
||||
let next_slots = slot_meta.next_slots;
|
||||
pending_slots.extend(next_slots);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
current_slot = blocktree.get_next_slot(current_slot.unwrap())?;
|
||||
}
|
||||
|
||||
// Only increment repair_tries if the ledger contains every blob for every slot
|
||||
if repairs.is_empty() {
|
||||
repair_info.repair_tries += 1;
|
||||
}
|
||||
|
||||
// Optimistically try the next slot if we haven't gotten any repairs
|
||||
// for a while
|
||||
if repair_info.repair_tries >= MAX_REPAIR_TRIES {
|
||||
repairs.push(RepairType::HighestBlob(repair_info.max_slot + 1, 0))
|
||||
}
|
||||
|
||||
Ok(repairs)
|
||||
}
|
||||
}
|
||||
|
||||
@ -234,67 +208,26 @@ mod test {
|
||||
use super::*;
|
||||
use crate::blocktree::tests::{make_many_slot_entries, make_slot_entries};
|
||||
use crate::blocktree::{get_tmp_ledger_path, Blocktree};
|
||||
use crate::entry::create_ticks;
|
||||
use crate::entry::{make_tiny_test_entries, EntrySlice};
|
||||
use solana_sdk::hash::Hash;
|
||||
|
||||
#[test]
|
||||
pub fn test_repair_missed_future_slot() {
|
||||
pub fn test_repair_orphan() {
|
||||
let blocktree_path = get_tmp_ledger_path!();
|
||||
{
|
||||
let blocktree = Blocktree::open(&blocktree_path).unwrap();
|
||||
|
||||
let mut blobs = create_ticks(1, Hash::default()).to_blobs();
|
||||
blobs[0].set_index(0);
|
||||
blobs[0].set_slot(0);
|
||||
blobs[0].set_is_last_in_slot();
|
||||
|
||||
blocktree.write_blobs(&blobs).unwrap();
|
||||
|
||||
let mut repair_info = RepairInfo::new();
|
||||
let repair_slot_range = RepairSlotRange::default();
|
||||
// We have all the blobs for all the slots in the ledger, wait for optimistic
|
||||
// future repair after MAX_REPAIR_TRIES
|
||||
for i in 0..MAX_REPAIR_TRIES {
|
||||
// Check that repair tries to patch the empty slot
|
||||
assert_eq!(repair_info.repair_tries, i);
|
||||
assert_eq!(repair_info.max_slot, 0);
|
||||
let expected = if i == MAX_REPAIR_TRIES - 1 {
|
||||
vec![RepairType::HighestBlob(1, 0)]
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
2,
|
||||
&mut repair_info,
|
||||
&repair_slot_range
|
||||
)
|
||||
.unwrap(),
|
||||
expected
|
||||
);
|
||||
}
|
||||
|
||||
// Insert a bigger blob, see that we the MAX_REPAIR_TRIES gets reset
|
||||
let mut blobs = create_ticks(1, Hash::default()).to_blobs();
|
||||
blobs[0].set_index(0);
|
||||
blobs[0].set_slot(1);
|
||||
blobs[0].set_is_last_in_slot();
|
||||
|
||||
// Create some orphan slots
|
||||
let (mut blobs, _) = make_slot_entries(1, 0, 1);
|
||||
let (blobs2, _) = make_slot_entries(5, 2, 1);
|
||||
blobs.extend(blobs2);
|
||||
blocktree.write_blobs(&blobs).unwrap();
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
2,
|
||||
&mut repair_info,
|
||||
&repair_slot_range
|
||||
)
|
||||
.unwrap(),
|
||||
vec![]
|
||||
RepairService::generate_repairs(&blocktree, 2).unwrap(),
|
||||
vec![
|
||||
RepairType::HighestBlob(0, 0),
|
||||
RepairType::Orphan(0),
|
||||
RepairType::Orphan(2)
|
||||
]
|
||||
);
|
||||
assert_eq!(repair_info.repair_tries, 1);
|
||||
assert_eq!(repair_info.max_slot, 1);
|
||||
}
|
||||
|
||||
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
|
||||
@ -306,25 +239,16 @@ mod test {
|
||||
{
|
||||
let blocktree = Blocktree::open(&blocktree_path).unwrap();
|
||||
|
||||
let mut blobs = make_tiny_test_entries(1).to_blobs();
|
||||
blobs[0].set_index(1);
|
||||
blobs[0].set_slot(2);
|
||||
let (blobs, _) = make_slot_entries(2, 0, 1);
|
||||
|
||||
let mut repair_info = RepairInfo::new();
|
||||
|
||||
// Write this blob to slot 2, should chain to slot 1, which we haven't received
|
||||
// Write this blob to slot 2, should chain to slot 0, which we haven't received
|
||||
// any blobs for
|
||||
blocktree.write_blobs(&blobs).unwrap();
|
||||
|
||||
// Check that repair tries to patch the empty slot
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
2,
|
||||
&mut repair_info,
|
||||
&RepairSlotRange::default()
|
||||
)
|
||||
.unwrap(),
|
||||
vec![RepairType::HighestBlob(0, 0), RepairType::Blob(2, 0)]
|
||||
RepairService::generate_repairs(&blocktree, 2).unwrap(),
|
||||
vec![RepairType::HighestBlob(0, 0), RepairType::Orphan(0)]
|
||||
);
|
||||
}
|
||||
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
|
||||
@ -340,8 +264,6 @@ mod test {
|
||||
let num_entries_per_slot = 5 * nth;
|
||||
let num_slots = 2;
|
||||
|
||||
let mut repair_info = RepairInfo::new();
|
||||
|
||||
// Create some blobs
|
||||
let (blobs, _) =
|
||||
make_many_slot_entries(0, num_slots as u64, num_entries_per_slot as u64);
|
||||
@ -363,28 +285,13 @@ mod test {
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Across all slots, find all missing indexes in the range [0, num_entries_per_slot]
|
||||
let repair_slot_range = RepairSlotRange::default();
|
||||
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
std::usize::MAX,
|
||||
&mut repair_info,
|
||||
&repair_slot_range
|
||||
)
|
||||
.unwrap(),
|
||||
RepairService::generate_repairs(&blocktree, std::usize::MAX).unwrap(),
|
||||
expected
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
expected.len() - 2,
|
||||
&mut repair_info,
|
||||
&repair_slot_range
|
||||
)
|
||||
.unwrap()[..],
|
||||
RepairService::generate_repairs(&blocktree, expected.len() - 2).unwrap()[..],
|
||||
expected[0..expected.len() - 2]
|
||||
);
|
||||
}
|
||||
@ -399,8 +306,6 @@ mod test {
|
||||
|
||||
let num_entries_per_slot = 10;
|
||||
|
||||
let mut repair_info = RepairInfo::new();
|
||||
|
||||
// Create some blobs
|
||||
let (mut blobs, _) = make_slot_entries(0, 0, num_entries_per_slot as u64);
|
||||
|
||||
@ -412,23 +317,15 @@ mod test {
|
||||
// We didn't get the last blob for this slot, so ask for the highest blob for that slot
|
||||
let expected: Vec<RepairType> = vec![RepairType::HighestBlob(0, num_entries_per_slot)];
|
||||
|
||||
let repair_slot_range = RepairSlotRange::default();
|
||||
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
std::usize::MAX,
|
||||
&mut repair_info,
|
||||
&repair_slot_range
|
||||
)
|
||||
.unwrap(),
|
||||
RepairService::generate_repairs(&blocktree, std::usize::MAX).unwrap(),
|
||||
expected
|
||||
);
|
||||
}
|
||||
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
|
||||
}
|
||||
|
||||
#[test]
|
||||
/*#[test]
|
||||
pub fn test_repair_range() {
|
||||
let blocktree_path = get_tmp_ledger_path!();
|
||||
{
|
||||
@ -456,17 +353,11 @@ mod test {
|
||||
repair_slot_range.end = end;
|
||||
|
||||
assert_eq!(
|
||||
RepairService::generate_repairs(
|
||||
&blocktree,
|
||||
std::usize::MAX,
|
||||
&mut repair_info,
|
||||
&repair_slot_range
|
||||
)
|
||||
.unwrap(),
|
||||
RepairService::generate_repairs(&blocktree, std::usize::MAX, &mut repair_info,)
|
||||
.unwrap(),
|
||||
expected
|
||||
);
|
||||
}
|
||||
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
|
||||
}
|
||||
|
||||
}*/
|
||||
}
|
||||
|
Reference in New Issue
Block a user