Add scalable gossip library (#1546)

* Cluster Replicated Data Store

Separate the data storage and merge strategy from the network IO boundary.
Implement an eager push overlay for transporting recent messages.

Simulation shows fast convergence with 20k nodes.
anatoly yakovenko
2018-11-15 13:23:26 -08:00
committed by GitHub
parent 4a3230904e
commit a41254e18c
31 changed files with 2821 additions and 1698 deletions
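A minimal sketch of the storage/merge split the commit message describes: the merge rule lives in pure data code, and the network IO layer only asks whether a value was newly stored (and therefore worth eager-pushing to recently active peers). The `CrdsStore`/`CrdsValue` names and the last-write-wins-on-wallclock rule are illustrative assumptions for this sketch, not this commit's exact API.

use std::collections::HashMap;

// Illustrative sketch only; types and merge rule are assumptions, not the commit's API.
#[derive(Clone, Debug)]
struct CrdsValue {
    id: u64,        // identity of the origin node
    wallclock: u64, // sender timestamp used as the merge tiebreaker
    data: String,
}

#[derive(Default)]
struct CrdsStore {
    table: HashMap<u64, CrdsValue>,
}

impl CrdsStore {
    // Merge strategy: last write wins on wallclock. No sockets here; the
    // caller learns whether the value is new and so worth eager-pushing.
    fn insert_info(&mut self, v: CrdsValue) -> bool {
        match self.table.get(&v.id) {
            Some(cur) if cur.wallclock >= v.wallclock => false,
            _ => {
                self.table.insert(v.id, v);
                true
            }
        }
    }
}

fn main() {
    let mut store = CrdsStore::default();
    assert!(store.insert_info(CrdsValue { id: 1, wallclock: 10, data: "a".into() }));
    // a stale update loses the merge and is not re-gossiped
    assert!(!store.insert_info(CrdsValue { id: 1, wallclock: 5, data: "b".into() }));
    // a fresh update wins and would be pushed out over the overlay
    assert!(store.insert_info(CrdsValue { id: 1, wallclock: 20, data: "c".into() }));
}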

View File

@@ -10,6 +10,7 @@ use solana::ncp::Ncp;
use solana::packet::{Blob, SharedBlob};
use solana::result;
use solana::service::Service;
use solana::timing::timestamp;
use std::net::UdpSocket;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
@@ -22,6 +23,7 @@ fn test_node(exit: Arc<AtomicBool>) -> (Arc<RwLock<ClusterInfo>>, Ncp, UdpSocket
let c = Arc::new(RwLock::new(cluster_info));
let w = Arc::new(RwLock::new(vec![]));
let d = Ncp::new(&c.clone(), w, None, tn.sockets.gossip, exit);
let _ = c.read().unwrap().my_data();
(c, d, tn.sockets.replicate.pop().unwrap())
}
@@ -29,38 +31,31 @@ fn test_node(exit: Arc<AtomicBool>) -> (Arc<RwLock<ClusterInfo>>, Ncp, UdpSocket
/// Run until every node in the network has a full NodeInfo set.
/// Check that nodes stop sending updates after all the NodeInfo has been shared.
/// Tests that actually use this function are below.
fn run_gossip_topo<F>(topo: F)
fn run_gossip_topo<F>(num: usize, topo: F)
where
F: Fn(&Vec<(Arc<RwLock<ClusterInfo>>, Ncp, UdpSocket)>) -> (),
{
let num: usize = 5;
let exit = Arc::new(AtomicBool::new(false));
let listen: Vec<_> = (0..num).map(|_| test_node(exit.clone())).collect();
topo(&listen);
let mut done = true;
for i in 0..(num * 32) {
done = false;
trace!("round {}", i);
for (c, _, _) in &listen {
if num == c.read().unwrap().convergence() as usize {
done = true;
break;
}
}
// at least 1 node converged
if done {
done = true;
let total: usize = listen
.iter()
.map(|v| v.0.read().unwrap().ncp_peers().len())
.sum();
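// converged once the summed peer counts (plus one self entry per node)
// exceed 90% of the num * num full-mesh total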
if (total + num) * 10 > num * num * 9 {
done = true;
break;
} else {
trace!("not converged {} {} {}", i, total + num, num * num);
}
sleep(Duration::new(1, 0));
}
exit.store(true, Ordering::Relaxed);
for (c, dr, _) in listen {
for (_, dr, _) in listen {
dr.join().unwrap();
// make it clear what failed
// protocol is too chatty, updates should stop after everyone receives `num`
assert!(c.read().unwrap().update_index <= num as u64);
// protocol is not chatty enough, everyone should get `num` entries
assert_eq!(c.read().unwrap().table.len(), num);
}
assert!(done);
}
@@ -68,37 +63,57 @@ where
#[test]
fn gossip_ring() -> result::Result<()> {
logger::setup();
run_gossip_topo(|listen| {
run_gossip_topo(50, |listen| {
let num = listen.len();
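// wire a directed ring: node n's own entry is inserted into node n + 1 with a fresh wallclock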
for n in 0..num {
let y = n % listen.len();
let x = (n + 1) % listen.len();
let mut xv = listen[x].0.write().unwrap();
let yv = listen[y].0.read().unwrap();
let mut d = yv.table[&yv.id].clone();
d.version = 0;
xv.insert(&d);
let mut d = yv.lookup(yv.id()).unwrap().clone();
d.wallclock = timestamp();
xv.insert_info(d);
}
});
Ok(())
}
/// ring a -> b -> c -> d -> e -> a
#[test]
#[ignore]
fn gossip_ring_large() -> result::Result<()> {
logger::setup();
run_gossip_topo(600, |listen| {
let num = listen.len();
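// same directed-ring wiring as gossip_ring above, scaled up to 600 nodes (hence #[ignore])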
for n in 0..num {
let y = n % listen.len();
let x = (n + 1) % listen.len();
let mut xv = listen[x].0.write().unwrap();
let yv = listen[y].0.read().unwrap();
let mut d = yv.lookup(yv.id()).unwrap().clone();
d.wallclock = timestamp();
xv.insert_info(d);
}
});
Ok(())
}
/// star a -> (b,c,d,e)
#[test]
fn gossip_star() {
logger::setup();
run_gossip_topo(|listen| {
run_gossip_topo(50, |listen| {
let num = listen.len();
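// wire a star: the hub (node 0) learns each spoke's entry directly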
for n in 0..(num - 1) {
let x = 0;
let y = (n + 1) % listen.len();
let mut xv = listen[x].0.write().unwrap();
let yv = listen[y].0.read().unwrap();
let mut yd = yv.table[&yv.id].clone();
yd.version = 0;
xv.insert(&yd);
trace!("star leader {:?}", &xv.id.as_ref()[..4]);
let mut yd = yv.lookup(yv.id()).unwrap().clone();
yd.wallclock = timestamp();
xv.insert_info(yd);
trace!("star leader {}", &xv.id());
}
});
}
@@ -107,22 +122,18 @@ fn gossip_star() {
#[test]
fn gossip_rstar() {
logger::setup();
run_gossip_topo(|listen| {
run_gossip_topo(50, |listen| {
let num = listen.len();
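// reverse star: each spoke is seeded with only the hub's (node 0's) entry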
let xd = {
let xv = listen[0].0.read().unwrap();
xv.table[&xv.id].clone()
xv.lookup(xv.id()).unwrap().clone()
};
trace!("rstar leader {:?}", &xd.id.as_ref()[..4]);
trace!("rstar leader {}", xd.id);
for n in 0..(num - 1) {
let y = (n + 1) % listen.len();
let mut yv = listen[y].0.write().unwrap();
yv.insert(&xd);
trace!(
"rstar insert {:?} into {:?}",
&xd.id.as_ref()[..4],
&yv.id.as_ref()[..4]
);
yv.insert_info(xd.clone());
trace!("rstar insert {} into {}", xd.id, yv.id());
}
});
}
@@ -140,19 +151,20 @@ pub fn cluster_info_retransmit() -> result::Result<()> {
let c1_data = c1.read().unwrap().my_data().clone();
c1.write().unwrap().set_leader(c1_data.id);
c2.write().unwrap().insert(&c1_data);
c3.write().unwrap().insert(&c1_data);
c2.write().unwrap().insert_info(c1_data.clone());
c3.write().unwrap().insert_info(c1_data.clone());
c2.write().unwrap().set_leader(c1_data.id);
c3.write().unwrap().set_leader(c1_data.id);
let num = 3;
// wait to converge
trace!("waiting to converge:");
let mut done = false;
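// converged when each of the three nodes reports the other two as gossip peers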
for _ in 0..30 {
done = c1.read().unwrap().table.len() == 3
&& c2.read().unwrap().table.len() == 3
&& c3.read().unwrap().table.len() == 3;
done = c1.read().unwrap().ncp_peers().len() == num - 1
&& c2.read().unwrap().ncp_peers().len() == num - 1
&& c3.read().unwrap().ncp_peers().len() == num - 1;
if done {
break;
}
@@ -180,102 +192,3 @@ pub fn cluster_info_retransmit() -> result::Result<()> {
Ok(())
}
#[test]
#[ignore]
fn test_external_liveness_table() {
logger::setup();
assert!(cfg!(feature = "test"));
let c1_c4_exit = Arc::new(AtomicBool::new(false));
let c2_c3_exit = Arc::new(AtomicBool::new(false));
trace!("c1:");
let (c1, dr1, _) = test_node(c1_c4_exit.clone());
trace!("c2:");
let (c2, dr2, _) = test_node(c2_c3_exit.clone());
trace!("c3:");
let (c3, dr3, _) = test_node(c2_c3_exit.clone());
trace!("c4:");
let (c4, dr4, _) = test_node(c1_c4_exit.clone());
let c1_data = c1.read().unwrap().my_data().clone();
c1.write().unwrap().set_leader(c1_data.id);
let c2_id = c2.read().unwrap().id;
let c3_id = c3.read().unwrap().id;
let c4_id = c4.read().unwrap().id;
// Insert the remote data about c4
let c2_index_for_c4 = 10;
c2.write().unwrap().remote.insert(c4_id, c2_index_for_c4);
let c3_index_for_c4 = 20;
c3.write().unwrap().remote.insert(c4_id, c3_index_for_c4);
// Set up the initial network topology
c2.write().unwrap().insert(&c1_data);
c3.write().unwrap().insert(&c1_data);
c2.write().unwrap().set_leader(c1_data.id);
c3.write().unwrap().set_leader(c1_data.id);
// Wait to converge
trace!("waiting to converge:");
let mut done = false;
for _ in 0..30 {
done = c1.read().unwrap().table.len() == 3
&& c2.read().unwrap().table.len() == 3
&& c3.read().unwrap().table.len() == 3;
if done {
break;
}
sleep(Duration::new(1, 0));
}
assert!(done);
// Validate c1's external liveness table, then release lock rc1
{
let rc1 = c1.read().unwrap();
let el = rc1.get_external_liveness_entry(&c4.read().unwrap().id);
// Make sure liveness table entry for c4 exists on node c1
assert!(el.is_some());
let liveness_map = el.unwrap();
// Make sure liveness table entry contains correct result for c2
let c2_index_result_for_c4 = liveness_map.get(&c2_id);
assert!(c2_index_result_for_c4.is_some());
assert_eq!(*(c2_index_result_for_c4.unwrap()), c2_index_for_c4);
// Make sure liveness table entry contains correct result for c3
let c3_index_result_for_c4 = liveness_map.get(&c3_id);
assert!(c3_index_result_for_c4.is_some());
assert_eq!(*(c3_index_result_for_c4.unwrap()), c3_index_for_c4);
}
// Shutdown validators c2 and c3
c2_c3_exit.store(true, Ordering::Relaxed);
dr2.join().unwrap();
dr3.join().unwrap();
// Allow communication between c1 and c4, make sure that c1's external_liveness table
// entry for c4 gets cleared
c4.write().unwrap().insert(&c1_data);
c4.write().unwrap().set_leader(c1_data.id);
for _ in 0..30 {
done = c1
.read()
.unwrap()
.get_external_liveness_entry(&c4_id)
.is_none();
if done {
break;
}
sleep(Duration::new(1, 0));
}
assert!(done);
// Shutdown validators c1 and c4
c1_c4_exit.store(true, Ordering::Relaxed);
dr1.join().unwrap();
dr4.join().unwrap();
}

View File

@@ -45,9 +45,9 @@ fn make_spy_node(leader: &NodeInfo) -> (Ncp, Arc<RwLock<ClusterInfo>>, Pubkey) {
let mut spy = Node::new_localhost();
let me = spy.info.id.clone();
let daddr = "0.0.0.0:0".parse().unwrap();
spy.info.contact_info.tvu = daddr;
spy.info.tvu = daddr;
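// an unspecified (0.0.0.0) TVU address marks this node as a passive spy:
// it gossips normally, but peers treat the address as invalid and leave it
// out of retransmit targets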
let mut spy_cluster_info = ClusterInfo::new(spy.info).expect("ClusterInfo::new");
spy_cluster_info.insert(&leader);
spy_cluster_info.insert_info(leader.clone());
spy_cluster_info.set_leader(leader.id);
let spy_cluster_info_ref = Arc::new(RwLock::new(spy_cluster_info));
let spy_window = Arc::new(RwLock::new(default_window()));
@@ -68,7 +68,7 @@ fn make_listening_node(leader: &NodeInfo) -> (Ncp, Arc<RwLock<ClusterInfo>>, NodeInfo
let new_node_info = new_node.info.clone();
let me = new_node.info.id.clone();
let mut new_node_cluster_info = ClusterInfo::new(new_node_info).expect("ClusterInfo::new");
new_node_cluster_info.insert(&leader);
new_node_cluster_info.insert_info(leader.clone());
new_node_cluster_info.set_leader(leader.id);
let new_node_cluster_info_ref = Arc::new(RwLock::new(new_node_cluster_info));
let new_node_window = Arc::new(RwLock::new(default_window()));
@@ -96,8 +96,8 @@ fn converge(leader: &NodeInfo, num_nodes: usize) -> Vec<NodeInfo> {
let mut rv = vec![];
for _ in 0..30 {
let num = spy_ref.read().unwrap().convergence();
let mut v = spy_ref.read().unwrap().get_valid_peers();
if num >= num_nodes as u64 && v.len() >= num_nodes {
let mut v = spy_ref.read().unwrap().rpc_peers();
if num >= num_nodes && v.len() >= num_nodes {
rv.append(&mut v);
converged = true;
break;
@@ -183,7 +183,7 @@ fn test_multi_node_ledger_window() -> result::Result<()> {
&zero_ledger_path,
keypair,
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
false,
LeaderScheduler::from_bootstrap_leader(leader_pubkey),
None,
@@ -288,7 +288,7 @@ fn test_multi_node_validator_catchup_from_zero() -> result::Result<()> {
&ledger_path,
keypair,
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
false,
LeaderScheduler::from_bootstrap_leader(leader_pubkey),
None,
@@ -326,7 +326,7 @@ fn test_multi_node_validator_catchup_from_zero() -> result::Result<()> {
&zero_ledger_path,
keypair,
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
false,
LeaderScheduler::from_bootstrap_leader(leader_pubkey),
None,
@@ -420,7 +420,7 @@ fn test_multi_node_basic() {
&ledger_path,
keypair,
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
false,
LeaderScheduler::from_bootstrap_leader(leader_pubkey),
None,
@@ -496,7 +496,7 @@ fn test_boot_validator_from_file() -> result::Result<()> {
&ledger_path,
keypair,
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
false,
LeaderScheduler::from_bootstrap_leader(leader_pubkey),
None,
@@ -584,7 +584,7 @@ fn test_leader_restart_validator_start_from_old_ledger() -> result::Result<()> {
&stale_ledger_path,
keypair,
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
false,
LeaderScheduler::from_bootstrap_leader(leader_data.id),
None,
@@ -715,7 +715,7 @@ fn test_multi_node_dynamic_network() {
&ledger_path,
Arc::new(keypair),
Arc::new(Keypair::new()),
Some(leader_data.contact_info.ncp),
Some(leader_data.ncp),
true,
LeaderScheduler::from_bootstrap_leader(leader_pubkey),
None,
@@ -861,7 +861,7 @@ fn test_leader_to_validator_transition() {
&leader_ledger_path,
leader_keypair,
Arc::new(Keypair::new()),
Some(leader_info.contact_info.ncp),
Some(leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -875,10 +875,10 @@ fn test_leader_to_validator_transition() {
let mut converged = false;
for _ in 0..30 {
let num = spy_node.read().unwrap().convergence();
let mut v: Vec<NodeInfo> = spy_node.read().unwrap().get_valid_peers();
let mut v: Vec<NodeInfo> = spy_node.read().unwrap().rpc_peers();
// Besides the spy node, only the leader is on the network, so the spy's
// convergence count should reach two nodes
if num >= 2 as u64 && v.len() >= 1 {
if num >= 2 && v.len() >= 1 {
converged = true;
break;
}
@@ -1001,7 +1001,7 @@ fn test_leader_validator_basic() {
&validator_ledger_path,
validator_keypair,
Arc::new(vote_account_keypair),
Some(leader_info.contact_info.ncp),
Some(leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1013,7 +1013,7 @@ fn test_leader_validator_basic() {
&leader_ledger_path,
leader_keypair,
Arc::new(Keypair::new()),
Some(leader_info.contact_info.ncp),
Some(leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1189,7 +1189,7 @@ fn test_dropped_handoff_recovery() {
&bootstrap_leader_ledger_path,
bootstrap_leader_keypair,
Arc::new(Keypair::new()),
Some(bootstrap_leader_info.contact_info.ncp),
Some(bootstrap_leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1212,7 +1212,7 @@ fn test_dropped_handoff_recovery() {
&validator_ledger_path,
kp,
Arc::new(Keypair::new()),
Some(bootstrap_leader_info.contact_info.ncp),
Some(bootstrap_leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1238,7 +1238,7 @@ fn test_dropped_handoff_recovery() {
&next_leader_ledger_path,
next_leader_keypair,
Arc::new(vote_account_keypair),
Some(bootstrap_leader_info.contact_info.ncp),
Some(bootstrap_leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1355,7 +1355,7 @@ fn test_full_leader_validator_network() {
&bootstrap_leader_ledger_path,
Arc::new(node_keypairs.pop_front().unwrap()),
Arc::new(vote_account_keypairs.pop_front().unwrap()),
Some(bootstrap_leader_info.contact_info.ncp),
Some(bootstrap_leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1382,7 +1382,7 @@ fn test_full_leader_validator_network() {
&validator_ledger_path,
Arc::new(kp),
Arc::new(vote_account_keypairs.pop_front().unwrap()),
Some(bootstrap_leader_info.contact_info.ncp),
Some(bootstrap_leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1559,7 +1559,7 @@ fn test_broadcast_last_tick() {
&bootstrap_leader_ledger_path,
Arc::new(bootstrap_leader_keypair),
Arc::new(Keypair::new()),
Some(bootstrap_leader_info.contact_info.ncp),
Some(bootstrap_leader_info.ncp),
false,
LeaderScheduler::new(&leader_scheduler_config),
None,
@@ -1621,12 +1621,8 @@ fn test_broadcast_last_tick() {
fn mk_client(leader: &NodeInfo) -> ThinClient {
let transactions_socket = UdpSocket::bind("0.0.0.0:0").unwrap();
assert!(ClusterInfo::is_valid_address(&leader.contact_info.tpu));
ThinClient::new(
leader.contact_info.rpc,
leader.contact_info.tpu,
transactions_socket,
)
assert!(ClusterInfo::is_valid_address(&leader.tpu));
ThinClient::new(leader.rpc, leader.tpu, transactions_socket)
}
fn send_tx_and_retry_get_balance(