retransmits shreds recovered from erasure codes

Shreds recovered from erasure codes have not been received from turbine
and have not been retransmitted to other nodes downstream. This results
in more repairs across the cluster which is slower.

This commit channels through recovered shreds to retransmit stage in
order to further broadcast the shreds to downstream nodes in the tree.
This commit is contained in:
behzad nouri
2021-08-13 14:11:37 -04:00
parent 3c71670bd9
commit 7a8807b8bb
3 changed files with 112 additions and 114 deletions

View File

@ -460,6 +460,8 @@ impl RetransmitStage {
ancestor_hashes_replay_update_receiver: AncestorHashesReplayUpdateReceiver,
) -> Self {
let (retransmit_sender, retransmit_receiver) = channel();
// https://github.com/rust-lang/rust/issues/39364#issuecomment-634545136
let _retransmit_sender = retransmit_sender.clone();
let retransmit_receiver = Arc::new(Mutex::new(retransmit_receiver));
let thread_hdls = retransmitter(
@ -598,6 +600,7 @@ mod tests {
let cluster_info = Arc::new(cluster_info);
let (retransmit_sender, retransmit_receiver) = channel();
let _retransmit_sender = retransmit_sender.clone();
let _t_retransmit = retransmitter(
retransmit_socket,
bank_forks,

View File

@ -260,6 +260,7 @@ fn run_insert<F>(
metrics: &mut BlockstoreInsertionMetrics,
ws_metrics: &mut WindowServiceMetrics,
completed_data_sets_sender: &CompletedDataSetsSender,
retransmit_sender: &Sender<Vec<Shred>>,
outstanding_requests: &RwLock<OutstandingShredRepairs>,
) -> Result<()>
where
@ -287,7 +288,8 @@ where
shreds,
repairs,
Some(leader_schedule_cache),
false,
false, // is_trusted
Some(retransmit_sender),
&handle_duplicate,
metrics,
)?;
@ -467,6 +469,7 @@ impl WindowService {
insert_receiver,
duplicate_sender,
completed_data_sets_sender,
retransmit_sender.clone(),
outstanding_requests,
);
@ -528,6 +531,7 @@ impl WindowService {
insert_receiver: CrossbeamReceiver<(Vec<Shred>, Vec<Option<RepairMeta>>)>,
check_duplicate_sender: CrossbeamSender<Shred>,
completed_data_sets_sender: CompletedDataSetsSender,
retransmit_sender: Sender<Vec<Shred>>,
outstanding_requests: Arc<RwLock<OutstandingShredRepairs>>,
) -> JoinHandle<()> {
let mut handle_timeout = || {};
@ -557,6 +561,7 @@ impl WindowService {
&mut metrics,
&mut ws_metrics,
&completed_data_sets_sender,
&retransmit_sender,
&outstanding_requests,
) {
if Self::should_exit_on_error(e, &mut handle_timeout, &handle_error) {