Fix repair (#3581)

Add DetachedHeads repair protocol

Add DetachedHeads repair test

Repair starting from root
This commit is contained in:
carllin
2019-04-06 19:41:22 -07:00
committed by GitHub
parent 03da63b41b
commit 20aa4434e2
9 changed files with 343 additions and 319 deletions

View File

@ -16,28 +16,16 @@ use std::time::Duration;
pub const MAX_REPAIR_LENGTH: usize = 16;
pub const REPAIR_MS: u64 = 100;
pub const MAX_REPAIR_TRIES: u64 = 128;
pub const NUM_FORKS_TO_REPAIR: usize = 5;
pub const MAX_ORPHANS: usize = 5;
#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepairType {
Orphan(u64),
HighestBlob(u64, u64),
Blob(u64, u64),
}
#[derive(Default)]
struct RepairInfo {
max_slot: u64,
repair_tries: u64,
}
impl RepairInfo {
fn new() -> Self {
RepairInfo {
max_slot: 0,
repair_tries: 0,
}
}
}
pub struct RepairSlotRange {
pub start: u64,
pub end: u64,
@ -62,50 +50,36 @@ impl RepairService {
exit: Arc<AtomicBool>,
repair_socket: &Arc<UdpSocket>,
cluster_info: &Arc<RwLock<ClusterInfo>>,
repair_slot_range: RepairSlotRange,
_repair_slot_range: RepairSlotRange,
) {
let mut repair_info = RepairInfo::new();
let id = cluster_info.read().unwrap().id();
loop {
if exit.load(Ordering::Relaxed) {
break;
}
let repairs = Self::generate_repairs(
blocktree,
MAX_REPAIR_LENGTH,
&mut repair_info,
&repair_slot_range,
);
let repairs = Self::generate_repairs(blocktree, MAX_REPAIR_LENGTH);
if let Ok(repairs) = repairs {
let reqs: Vec<_> = repairs
.into_iter()
.filter_map(|repair_request| {
let (slot, blob_index, is_highest_request) = {
match repair_request {
RepairType::Blob(s, i) => (s, i, false),
RepairType::HighestBlob(s, i) => (s, i, true),
}
};
cluster_info
.read()
.unwrap()
.window_index_request(slot, blob_index, is_highest_request)
.map(|result| (result, slot, blob_index))
.repair_request(&repair_request)
.map(|result| (result, repair_request))
.ok()
})
.collect();
for ((to, req), slot, blob_index) in reqs {
for ((to, req), repair_request) in reqs {
if let Ok(local_addr) = repair_socket.local_addr() {
submit(
influxdb::Point::new("repair_service")
.add_field("repair_slot", influxdb::Value::Integer(slot as i64))
.to_owned()
.add_field(
"repair_blob",
influxdb::Value::Integer(blob_index as i64),
"repair_request",
influxdb::Value::String(format!("{:?}", repair_request)),
)
.to_owned()
.add_field("to", influxdb::Value::String(to.to_string()))
@ -151,7 +125,7 @@ impl RepairService {
RepairService { t_repair }
}
fn process_slot(
fn generate_repairs_for_slot(
blocktree: &Blocktree,
slot: u64,
slot_meta: &SlotMeta,
@ -175,49 +149,49 @@ impl RepairService {
}
}
fn generate_repairs(
blocktree: &Blocktree,
max_repairs: usize,
repair_info: &mut RepairInfo,
repair_range: &RepairSlotRange,
) -> Result<(Vec<RepairType>)> {
fn generate_repairs(blocktree: &Blocktree, max_repairs: usize) -> Result<(Vec<RepairType>)> {
// Slot height and blob indexes for blobs we want to repair
let mut repairs: Vec<RepairType> = vec![];
let mut current_slot = Some(repair_range.start);
while repairs.len() < max_repairs && current_slot.is_some() {
if current_slot.unwrap() > repair_range.end {
break;
}
let slot = *blocktree.root_slot.read().unwrap();
Self::generate_repairs_for_fork(blocktree, &mut repairs, max_repairs, slot);
if current_slot.unwrap() > repair_info.max_slot {
repair_info.repair_tries = 0;
repair_info.max_slot = current_slot.unwrap();
}
// TODO: Incorporate gossip to determine priorities for repair?
if let Some(slot) = blocktree.meta(current_slot.unwrap())? {
let new_repairs = Self::process_slot(
// Try to resolve orphans in blocktree
let orphans = blocktree.get_orphans(Some(MAX_ORPHANS));
Self::generate_repairs_for_orphans(&orphans[..], &mut repairs);
Ok(repairs)
}
fn generate_repairs_for_orphans(orphans: &[u64], repairs: &mut Vec<RepairType>) {
repairs.extend(orphans.iter().map(|h| RepairType::Orphan(*h)));
}
/// Repairs any fork starting at the input slot
fn generate_repairs_for_fork(
blocktree: &Blocktree,
repairs: &mut Vec<RepairType>,
max_repairs: usize,
slot: u64,
) {
let mut pending_slots = vec![slot];
while repairs.len() < max_repairs && !pending_slots.is_empty() {
let slot = pending_slots.pop().unwrap();
if let Some(slot_meta) = blocktree.meta(slot).unwrap() {
let new_repairs = Self::generate_repairs_for_slot(
blocktree,
current_slot.unwrap(),
&slot,
slot,
&slot_meta,
max_repairs - repairs.len(),
);
repairs.extend(new_repairs);
let next_slots = slot_meta.next_slots;
pending_slots.extend(next_slots);
} else {
break;
}
current_slot = blocktree.get_next_slot(current_slot.unwrap())?;
}
// Only increment repair_tries if the ledger contains every blob for every slot
if repairs.is_empty() {
repair_info.repair_tries += 1;
}
// Optimistically try the next slot if we haven't gotten any repairs
// for a while
if repair_info.repair_tries >= MAX_REPAIR_TRIES {
repairs.push(RepairType::HighestBlob(repair_info.max_slot + 1, 0))
}
Ok(repairs)
}
}
@ -234,67 +208,26 @@ mod test {
use super::*;
use crate::blocktree::tests::{make_many_slot_entries, make_slot_entries};
use crate::blocktree::{get_tmp_ledger_path, Blocktree};
use crate::entry::create_ticks;
use crate::entry::{make_tiny_test_entries, EntrySlice};
use solana_sdk::hash::Hash;
#[test]
pub fn test_repair_missed_future_slot() {
pub fn test_repair_orphan() {
let blocktree_path = get_tmp_ledger_path!();
{
let blocktree = Blocktree::open(&blocktree_path).unwrap();
let mut blobs = create_ticks(1, Hash::default()).to_blobs();
blobs[0].set_index(0);
blobs[0].set_slot(0);
blobs[0].set_is_last_in_slot();
blocktree.write_blobs(&blobs).unwrap();
let mut repair_info = RepairInfo::new();
let repair_slot_range = RepairSlotRange::default();
// We have all the blobs for all the slots in the ledger, wait for optimistic
// future repair after MAX_REPAIR_TRIES
for i in 0..MAX_REPAIR_TRIES {
// Check that repair tries to patch the empty slot
assert_eq!(repair_info.repair_tries, i);
assert_eq!(repair_info.max_slot, 0);
let expected = if i == MAX_REPAIR_TRIES - 1 {
vec![RepairType::HighestBlob(1, 0)]
} else {
vec![]
};
assert_eq!(
RepairService::generate_repairs(
&blocktree,
2,
&mut repair_info,
&repair_slot_range
)
.unwrap(),
expected
);
}
// Insert a bigger blob, see that we the MAX_REPAIR_TRIES gets reset
let mut blobs = create_ticks(1, Hash::default()).to_blobs();
blobs[0].set_index(0);
blobs[0].set_slot(1);
blobs[0].set_is_last_in_slot();
// Create some orphan slots
let (mut blobs, _) = make_slot_entries(1, 0, 1);
let (blobs2, _) = make_slot_entries(5, 2, 1);
blobs.extend(blobs2);
blocktree.write_blobs(&blobs).unwrap();
assert_eq!(
RepairService::generate_repairs(
&blocktree,
2,
&mut repair_info,
&repair_slot_range
)
.unwrap(),
vec![]
RepairService::generate_repairs(&blocktree, 2).unwrap(),
vec![
RepairType::HighestBlob(0, 0),
RepairType::Orphan(0),
RepairType::Orphan(2)
]
);
assert_eq!(repair_info.repair_tries, 1);
assert_eq!(repair_info.max_slot, 1);
}
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
@ -306,25 +239,16 @@ mod test {
{
let blocktree = Blocktree::open(&blocktree_path).unwrap();
let mut blobs = make_tiny_test_entries(1).to_blobs();
blobs[0].set_index(1);
blobs[0].set_slot(2);
let (blobs, _) = make_slot_entries(2, 0, 1);
let mut repair_info = RepairInfo::new();
// Write this blob to slot 2, should chain to slot 1, which we haven't received
// Write this blob to slot 2, should chain to slot 0, which we haven't received
// any blobs for
blocktree.write_blobs(&blobs).unwrap();
// Check that repair tries to patch the empty slot
assert_eq!(
RepairService::generate_repairs(
&blocktree,
2,
&mut repair_info,
&RepairSlotRange::default()
)
.unwrap(),
vec![RepairType::HighestBlob(0, 0), RepairType::Blob(2, 0)]
RepairService::generate_repairs(&blocktree, 2).unwrap(),
vec![RepairType::HighestBlob(0, 0), RepairType::Orphan(0)]
);
}
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
@ -340,8 +264,6 @@ mod test {
let num_entries_per_slot = 5 * nth;
let num_slots = 2;
let mut repair_info = RepairInfo::new();
// Create some blobs
let (blobs, _) =
make_many_slot_entries(0, num_slots as u64, num_entries_per_slot as u64);
@ -363,28 +285,13 @@ mod test {
})
.collect();
// Across all slots, find all missing indexes in the range [0, num_entries_per_slot]
let repair_slot_range = RepairSlotRange::default();
assert_eq!(
RepairService::generate_repairs(
&blocktree,
std::usize::MAX,
&mut repair_info,
&repair_slot_range
)
.unwrap(),
RepairService::generate_repairs(&blocktree, std::usize::MAX).unwrap(),
expected
);
assert_eq!(
RepairService::generate_repairs(
&blocktree,
expected.len() - 2,
&mut repair_info,
&repair_slot_range
)
.unwrap()[..],
RepairService::generate_repairs(&blocktree, expected.len() - 2).unwrap()[..],
expected[0..expected.len() - 2]
);
}
@ -399,8 +306,6 @@ mod test {
let num_entries_per_slot = 10;
let mut repair_info = RepairInfo::new();
// Create some blobs
let (mut blobs, _) = make_slot_entries(0, 0, num_entries_per_slot as u64);
@ -412,23 +317,15 @@ mod test {
// We didn't get the last blob for this slot, so ask for the highest blob for that slot
let expected: Vec<RepairType> = vec![RepairType::HighestBlob(0, num_entries_per_slot)];
let repair_slot_range = RepairSlotRange::default();
assert_eq!(
RepairService::generate_repairs(
&blocktree,
std::usize::MAX,
&mut repair_info,
&repair_slot_range
)
.unwrap(),
RepairService::generate_repairs(&blocktree, std::usize::MAX).unwrap(),
expected
);
}
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
}
#[test]
/*#[test]
pub fn test_repair_range() {
let blocktree_path = get_tmp_ledger_path!();
{
@ -456,17 +353,11 @@ mod test {
repair_slot_range.end = end;
assert_eq!(
RepairService::generate_repairs(
&blocktree,
std::usize::MAX,
&mut repair_info,
&repair_slot_range
)
.unwrap(),
RepairService::generate_repairs(&blocktree, std::usize::MAX, &mut repair_info,)
.unwrap(),
expected
);
}
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
}
}*/
}