Fix repair (#3581)

Add DetachedHeads repair protocol

Add DetachedHeads repair test

Repair starting from root
carllin
2019-04-06 19:41:22 -07:00
committed by GitHub
parent 03da63b41b
commit 20aa4434e2
9 changed files with 343 additions and 319 deletions


@@ -20,6 +20,7 @@ use crate::crds_gossip_error::CrdsGossipError;
use crate::crds_gossip_pull::CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS;
use crate::crds_value::{CrdsValue, CrdsValueLabel, Vote};
use crate::packet::{to_shared_blob, Blob, SharedBlob, BLOB_SIZE};
use crate::repair_service::RepairType;
use crate::result::Result;
use crate::staking_utils;
use crate::streamer::{BlobReceiver, BlobSender};
@@ -58,6 +59,9 @@ pub const GROW_LAYER_CAPACITY: bool = false;
/// milliseconds we sleep for between gossip requests
pub const GOSSIP_SLEEP_MILLIS: u64 = 100;
/// the number of slots to respond with when responding to `Orphan` requests
pub const MAX_ORPHAN_REPAIR_RESPONSES: usize = 10;
#[derive(Debug, PartialEq, Eq)]
pub enum ClusterInfoError {
NoPeers,
@@ -161,6 +165,7 @@ enum Protocol {
/// TODO: move this message to a different module
RequestWindowIndex(ContactInfo, u64, u64),
RequestHighestWindowIndex(ContactInfo, u64, u64),
RequestOrphan(ContactInfo, u64),
}
impl ClusterInfo {
@@ -308,6 +313,7 @@ impl ClusterInfo {
.collect();
let max_ts = votes.iter().map(|x| x.0).max().unwrap_or(since);
let txs: Vec<Transaction> = votes.into_iter().map(|x| x.1).collect();
(txs, max_ts)
}
@@ -746,12 +752,13 @@ impl ClusterInfo {
Ok(out)
}
pub fn window_index_request(
&self,
slot: u64,
blob_index: u64,
get_highest: bool,
) -> Result<(SocketAddr, Vec<u8>)> {
fn orphan_bytes(&self, slot: u64) -> Result<Vec<u8>> {
let req = Protocol::RequestOrphan(self.my_data().clone(), slot);
let out = serialize(&req)?;
Ok(out)
}
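A minimal, self-contained sketch of the wire format this relies on, assuming the `serialize` call above is bincode-style serde serialization; `ToyProtocol` is a hypothetical stand-in for `Protocol` with the `ContactInfo` payload omitted:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
enum ToyProtocol {
    RequestWindowIndex(u64, u64),
    RequestHighestWindowIndex(u64, u64),
    RequestOrphan(u64),
}

fn main() {
    // An orphan request names only the slot whose ancestry the sender is missing.
    let req = ToyProtocol::RequestOrphan(42);
    let bytes = bincode::serialize(&req).expect("serialize");
    let back: ToyProtocol = bincode::deserialize(&bytes).expect("deserialize");
    assert_eq!(req, back);
}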
pub fn repair_request(&self, repair_request: &RepairType) -> Result<(SocketAddr, Vec<u8>)> {
// find a peer that appears to be accepting replication, as indicated
// by a valid tvu port location
let valid: Vec<_> = self.repair_peers();
@@ -761,19 +768,39 @@ impl ClusterInfo {
let n = thread_rng().gen::<usize>() % valid.len();
let addr = valid[n].gossip; // send the request to the peer's gossip port
let out = {
if get_highest {
self.window_highest_index_request_bytes(slot, blob_index)?
} else {
self.window_index_request_bytes(slot, blob_index)?
match repair_request {
RepairType::Blob(slot, blob_index) => {
submit(
influxdb::Point::new("cluster_info-repair")
.add_field("repair-slot", influxdb::Value::Integer(*slot as i64))
.add_field("repair-ix", influxdb::Value::Integer(*blob_index as i64))
.to_owned(),
);
self.window_index_request_bytes(*slot, *blob_index)?
}
RepairType::HighestBlob(slot, blob_index) => {
submit(
influxdb::Point::new("cluster_info-repair_highest")
.add_field(
"repair-highest-slot",
influxdb::Value::Integer(*slot as i64),
)
.add_field("repair-highest-ix", influxdb::Value::Integer(*slot as i64))
.to_owned(),
);
self.window_highest_index_request_bytes(*slot, *blob_index)?
}
RepairType::Orphan(slot) => {
submit(
influxdb::Point::new("cluster_info-repair_orphan")
.add_field("repair-orphan", influxdb::Value::Integer(*slot as i64))
.to_owned(),
);
self.orphan_bytes(*slot)?
}
}
};
submit(
influxdb::Point::new("cluster-info")
.add_field("repair-ix", influxdb::Value::Integer(blob_index as i64))
.to_owned(),
);
Ok((addr, out))
}
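The peer-selection step above comes down to: collect peers advertising a valid tvu port, pick one uniformly at random, and address the request to that peer's gossip port. A self-contained sketch of just that step, with peers simplified to bare `SocketAddr`s and the hypothetical helper name `pick_repair_peer`:

use rand::{thread_rng, Rng};
use std::net::SocketAddr;

fn pick_repair_peer(valid: &[SocketAddr]) -> Option<SocketAddr> {
    if valid.is_empty() {
        // repair_request surfaces this case as ClusterInfoError::NoPeers
        return None;
    }
    // uniform random choice, mirroring the thread_rng().gen::<usize>() % len above
    let n = thread_rng().gen::<usize>() % valid.len();
    Some(valid[n])
}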
// If the network entrypoint hasn't been discovered yet, add it to the crds table
@@ -966,6 +993,35 @@ impl ClusterInfo {
vec![]
}
fn run_orphan(
from_addr: &SocketAddr,
blocktree: Option<&Arc<Blocktree>>,
mut slot: u64,
max_responses: usize,
) -> Vec<SharedBlob> {
let mut res = vec![];
if let Some(blocktree) = blocktree {
// Try to find the next "n" parent slots of the input slot
while let Ok(Some(meta)) = blocktree.meta(slot) {
if meta.received == 0 {
break;
}
let blob = blocktree.get_data_blob(slot, meta.received - 1);
if let Ok(Some(mut blob)) = blob {
blob.meta.set_addr(from_addr);
res.push(Arc::new(RwLock::new(blob)));
}
if meta.is_parent_set() && res.len() <= max_responses {
slot = meta.parent_slot;
} else {
break;
}
}
}
res
}
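The loop above is the heart of the orphan protocol: reply with the highest blob held for the requested slot, then keep following `parent_slot` links toward the root until a slot is unknown, empty, or parentless, or the response budget is spent. A toy model of that traversal, using a hypothetical `ToyMeta` in place of `SlotMeta` and returning `(slot, highest_index)` pairs instead of blobs:

use std::collections::HashMap;

struct ToyMeta {
    received: u64,            // blobs held for this slot; highest index is received - 1
    parent_slot: Option<u64>, // None models an unset parent
}

fn walk_orphan(metas: &HashMap<u64, ToyMeta>, mut slot: u64, max_responses: usize) -> Vec<(u64, u64)> {
    let mut res = vec![];
    while let Some(meta) = metas.get(&slot) {
        if meta.received == 0 {
            break; // nothing received for this slot yet
        }
        res.push((slot, meta.received - 1));
        match meta.parent_slot {
            // keep climbing while the parent is known and the budget allows
            Some(parent) if res.len() <= max_responses => slot = parent,
            _ => break,
        }
    }
    res
}

With a chain 1 <- 2 <- 3 holding five blobs per slot, walk_orphan(&metas, 3, 5) yields [(3, 4), (2, 4), (1, 4)], which is exactly the shape the run_orphan test below asserts against.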
//TODO we should first coalesce all the requests
fn handle_blob(
obj: &Arc<RwLock<Self>>,
@@ -1082,14 +1138,21 @@ impl ClusterInfo {
vec![]
}
}
fn handle_request_window_index(
fn get_repair_sender(request: &Protocol) -> &ContactInfo {
match request {
Protocol::RequestWindowIndex(ref from, _, _) => from,
Protocol::RequestHighestWindowIndex(ref from, _, _) => from,
Protocol::RequestOrphan(ref from, _) => from,
_ => panic!("Not a repair request"),
}
}
fn handle_repair(
me: &Arc<RwLock<Self>>,
from: &ContactInfo,
blocktree: Option<&Arc<Blocktree>>,
slot: u64,
blob_index: u64,
from_addr: &SocketAddr,
is_get_highest: bool,
blocktree: Option<&Arc<Blocktree>>,
request: Protocol,
) -> Vec<SharedBlob> {
let now = Instant::now();
@@ -1098,12 +1161,13 @@ impl ClusterInfo {
//TODO verify from is signed
let self_id = me.read().unwrap().gossip.id;
let from = Self::get_repair_sender(&request);
if from.id == self_id {
warn!(
"{}: Ignored received RequestWindowIndex from ME {} {} {} ",
self_id, from.id, slot, blob_index,
"{}: Ignored received repair request from ME {}",
self_id, from.id,
);
inc_new_counter_info!("cluster_info-window-request-address-eq", 1);
inc_new_counter_info!("cluster_info-handle-repair--eq", 1);
return vec![];
}
@@ -1113,26 +1177,49 @@ impl ClusterInfo {
.crds
.update_record_timestamp(&from.id, timestamp());
let my_info = me.read().unwrap().my_data().clone();
inc_new_counter_info!("cluster_info-window-request-recv", 1);
trace!(
"{}: received RequestWindowIndex from: {} slot: {}, blob_index: {}",
self_id,
from.id,
slot,
blob_index,
);
let res = {
if is_get_highest {
Self::run_highest_window_request(&from_addr, blocktree, slot, blob_index)
} else {
Self::run_window_request(&from, &from_addr, blocktree, &my_info, slot, blob_index)
let (res, label) = {
match &request {
Protocol::RequestWindowIndex(from, slot, blob_index) => {
inc_new_counter_info!("cluster_info-request-window-index", 1);
(
Self::run_window_request(
from,
&from_addr,
blocktree,
&my_info,
*slot,
*blob_index,
),
"RequestWindowIndex",
)
}
Protocol::RequestHighestWindowIndex(_, slot, highest_index) => {
inc_new_counter_info!("cluster_info-request-highest-window-index", 1);
(
Self::run_highest_window_request(
&from_addr,
blocktree,
*slot,
*highest_index,
),
"RequestHighestWindowIndex",
)
}
Protocol::RequestOrphan(_, slot) => {
inc_new_counter_info!("cluster_info-request-orphan", 1);
(
Self::run_orphan(&from_addr, blocktree, *slot, MAX_ORPHAN_REPAIR_RESPONSES),
"RequestOrphan",
)
}
_ => panic!("Not a repair request"),
}
};
report_time_spent(
"RequestWindowIndex",
&now.elapsed(),
&format!("slot {}, blob_index: {}", slot, blob_index),
);
trace!("{}: received repair request: {:?}", self_id, request);
report_time_spent(label, &now.elapsed(), "");
res
}
@@ -1198,22 +1285,7 @@ impl ClusterInfo {
}
vec![]
}
Protocol::RequestWindowIndex(from, slot, blob_index) => {
Self::handle_request_window_index(
me, &from, blocktree, slot, blob_index, from_addr, false,
)
}
Protocol::RequestHighestWindowIndex(from, slot, highest_index) => {
Self::handle_request_window_index(
me,
&from,
blocktree,
slot,
highest_index,
from_addr,
true,
)
}
_ => Self::handle_repair(me, from_addr, blocktree, request),
}
}
@@ -1522,9 +1594,11 @@ fn report_time_spent(label: &str, time: &Duration, extra: &str) {
mod tests {
use super::*;
use crate::blocktree::get_tmp_ledger_path;
use crate::blocktree::tests::make_many_slot_entries;
use crate::blocktree::Blocktree;
use crate::crds_value::CrdsValueLabel;
use crate::packet::BLOB_HEADER_SIZE;
use crate::repair_service::RepairType;
use crate::result::Error;
use crate::test_tx::test_tx;
use solana_sdk::signature::{Keypair, KeypairUtil};
@@ -1591,7 +1665,7 @@ mod tests {
fn window_index_request() {
let me = ContactInfo::new_localhost(&Pubkey::new_rand(), timestamp());
let mut cluster_info = ClusterInfo::new_with_invalid_keypair(me);
let rv = cluster_info.window_index_request(0, 0, false);
let rv = cluster_info.repair_request(&RepairType::Blob(0, 0));
assert_matches!(rv, Err(Error::ClusterInfoError(ClusterInfoError::NoPeers)));
let gossip_addr = socketaddr!([127, 0, 0, 1], 1234);
@@ -1607,7 +1681,9 @@ mod tests {
0,
);
cluster_info.insert_info(nxt.clone());
let rv = cluster_info.window_index_request(0, 0, false).unwrap();
let rv = cluster_info
.repair_request(&RepairType::Blob(0, 0))
.unwrap();
assert_eq!(nxt.gossip, gossip_addr);
assert_eq!(rv.0, nxt.gossip);
@@ -1628,7 +1704,9 @@ mod tests {
let mut two = false;
while !one || !two {
//this randomly picks an option, so eventually it should pick both
let rv = cluster_info.window_index_request(0, 0, false).unwrap();
let rv = cluster_info
.repair_request(&RepairType::Blob(0, 0))
.unwrap();
if rv.0 == gossip_addr {
one = true;
}
@@ -1746,6 +1824,42 @@ mod tests {
Blocktree::destroy(&ledger_path).expect("Expected successful database destruction");
}
#[test]
fn run_orphan() {
solana_logger::setup();
let ledger_path = get_tmp_ledger_path!();
{
let blocktree = Arc::new(Blocktree::open(&ledger_path).unwrap());
let rv = ClusterInfo::run_orphan(&socketaddr_any!(), Some(&blocktree), 2, 0);
assert!(rv.is_empty());
// Create slots 1, 2, 3 with 5 blobs apiece
let (blobs, _) = make_many_slot_entries(1, 3, 5);
blocktree
.write_blobs(&blobs)
.expect("Expect successful ledger write");
// We don't have slot 4, so we don't know how to service this request
let rv = ClusterInfo::run_orphan(&socketaddr_any!(), Some(&blocktree), 4, 5);
assert!(rv.is_empty());
// For slot 3, we should return the highest blobs from slots 3, 2, 1 respectively
// for this request
let rv: Vec<_> = ClusterInfo::run_orphan(&socketaddr_any!(), Some(&blocktree), 3, 5)
.iter()
.map(|b| b.read().unwrap().clone())
.collect();
let expected: Vec<_> = (1..=3)
.rev()
.map(|slot| blocktree.get_data_blob(slot, 4).unwrap().unwrap())
.collect();
assert_eq!(rv, expected)
}
Blocktree::destroy(&ledger_path).expect("Expected successful database destruction");
}
#[test]
fn test_default_leader() {
solana_logger::setup();