Add EtcdTowerStorage
This commit is contained in:
26
Cargo.lock
generated
26
Cargo.lock
generated
@ -1308,6 +1308,21 @@ dependencies = [
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "etcd-client"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11d1f66c65d1b777fc92a5b57a32c35dcb28b644a8c2c5fbc363cc90e8b99e60"
|
||||
dependencies = [
|
||||
"http",
|
||||
"prost",
|
||||
"tokio 1.9.0",
|
||||
"tokio-stream",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "failure"
|
||||
version = "0.1.8"
|
||||
@ -1833,11 +1848,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.1"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28d569972648b2c512421b5f2a405ad6ac9666547189d0c5477a3f200f3e02f9"
|
||||
checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11"
|
||||
dependencies = [
|
||||
"bytes 0.5.4",
|
||||
"bytes 1.0.1",
|
||||
"fnv",
|
||||
"itoa",
|
||||
]
|
||||
@ -4677,6 +4692,7 @@ dependencies = [
|
||||
"crossbeam-channel",
|
||||
"dashmap",
|
||||
"ed25519-dalek",
|
||||
"etcd-client",
|
||||
"flate2",
|
||||
"fs_extra",
|
||||
"indexmap",
|
||||
@ -6785,9 +6801,9 @@ checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62"
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.0"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e987b6bf443f4b5b3b6f38704195592cca41c5bb7aedd3c3693c7081f8289860"
|
||||
checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
|
@ -25,6 +25,7 @@ chrono = { version = "0.4.11", features = ["serde"] }
|
||||
crossbeam-channel = "0.5"
|
||||
dashmap = { version = "4.0.2", features = ["rayon", "raw-api"] }
|
||||
ed25519-dalek = "=1.0.1"
|
||||
etcd-client = { version = "0.7.1", features = ["tls"]}
|
||||
fs_extra = "1.2.0"
|
||||
flate2 = "1.0"
|
||||
indexmap = { version = "1.7", features = ["rayon"] }
|
||||
@ -71,6 +72,7 @@ solana-vote-program = { path = "../programs/vote", version = "=1.8.0" }
|
||||
tempfile = "3.2.0"
|
||||
thiserror = "1.0"
|
||||
solana-rayon-threadlimit = { path = "../rayon-threadlimit", version = "=1.8.0" }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
trees = "0.4.2"
|
||||
|
||||
[dev-dependencies]
|
||||
@ -86,7 +88,6 @@ solana-version = { path = "../version", version = "=1.8.0" }
|
||||
static_assertions = "1.1.0"
|
||||
symlink = "0.1.0"
|
||||
systemstat = "0.1.8"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
[build-dependencies]
|
||||
rustc_version = "0.4"
|
||||
|
@ -8,6 +8,7 @@ use {
|
||||
fs::{self, File},
|
||||
io::{self, BufReader},
|
||||
path::PathBuf,
|
||||
sync::RwLock,
|
||||
},
|
||||
};
|
||||
|
||||
@ -127,3 +128,157 @@ impl TowerStorage for FileTowerStorage {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EtcdTowerStorage {
|
||||
client: RwLock<etcd_client::Client>,
|
||||
instance_id: [u8; 8],
|
||||
runtime: tokio::runtime::Runtime,
|
||||
}
|
||||
|
||||
/// TLS material handed to [`EtcdTowerStorage::new`] when connecting to etcd.
pub struct EtcdTlsConfig {
    /// Domain name passed to `etcd_client::TlsOptions::domain_name`.
    pub domain_name: String,

    /// PEM bytes of the CA certificate (fed to `etcd_client::Certificate::from_pem`).
    pub ca_certificate: Vec<u8>,

    /// PEM bytes of the client certificate (first argument of
    /// `etcd_client::Identity::from_pem`).
    pub identity_certificate: Vec<u8>,

    /// PEM bytes of the client private key (second argument of
    /// `etcd_client::Identity::from_pem`).
    pub identity_private_key: Vec<u8>,
}
|
||||
|
||||
impl EtcdTowerStorage {
|
||||
pub fn new<E: AsRef<str>, S: AsRef<[E]>>(
|
||||
endpoints: S,
|
||||
tls_config: Option<EtcdTlsConfig>,
|
||||
) -> Result<Self> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_io()
|
||||
.enable_time()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let client = runtime
|
||||
.block_on(async {
|
||||
etcd_client::Client::connect(
|
||||
endpoints,
|
||||
tls_config.map(|tls_config| {
|
||||
etcd_client::ConnectOptions::default().with_tls(
|
||||
etcd_client::TlsOptions::new()
|
||||
.domain_name(tls_config.domain_name)
|
||||
.ca_certificate(etcd_client::Certificate::from_pem(
|
||||
tls_config.ca_certificate,
|
||||
))
|
||||
.identity(etcd_client::Identity::from_pem(
|
||||
tls_config.identity_certificate,
|
||||
tls_config.identity_private_key,
|
||||
)),
|
||||
)
|
||||
}),
|
||||
)
|
||||
.await
|
||||
})
|
||||
.map_err(Self::etdc_to_tower_error)?;
|
||||
|
||||
Ok(Self {
|
||||
client: RwLock::new(client),
|
||||
instance_id: solana_sdk::timing::timestamp().to_le_bytes(),
|
||||
runtime,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_keys(node_pubkey: &Pubkey) -> (String, String) {
|
||||
let instance_key = format!("{}/instance", node_pubkey);
|
||||
let tower_key = format!("{}/tower", node_pubkey);
|
||||
(instance_key, tower_key)
|
||||
}
|
||||
|
||||
fn etdc_to_tower_error(error: etcd_client::Error) -> TowerError {
|
||||
TowerError::IoError(io::Error::new(io::ErrorKind::Other, error.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
impl TowerStorage for EtcdTowerStorage {
|
||||
fn load(&self, node_pubkey: &Pubkey) -> Result<SavedTower> {
|
||||
let (instance_key, tower_key) = Self::get_keys(node_pubkey);
|
||||
let mut client = self.client.write().unwrap();
|
||||
|
||||
let txn = etcd_client::Txn::new().and_then(vec![etcd_client::TxnOp::put(
|
||||
instance_key.clone(),
|
||||
self.instance_id,
|
||||
None,
|
||||
)]);
|
||||
self.runtime
|
||||
.block_on(async { client.txn(txn).await })
|
||||
.map_err(|err| {
|
||||
error!("Failed to acquire etcd instance lock: {}", err);
|
||||
Self::etdc_to_tower_error(err)
|
||||
})?;
|
||||
|
||||
let txn = etcd_client::Txn::new()
|
||||
.when(vec![etcd_client::Compare::value(
|
||||
instance_key,
|
||||
etcd_client::CompareOp::Equal,
|
||||
self.instance_id,
|
||||
)])
|
||||
.and_then(vec![etcd_client::TxnOp::get(tower_key, None)]);
|
||||
|
||||
let response = self
|
||||
.runtime
|
||||
.block_on(async { client.txn(txn).await })
|
||||
.map_err(|err| {
|
||||
error!("Failed to read etcd saved tower: {}", err);
|
||||
Self::etdc_to_tower_error(err)
|
||||
})?;
|
||||
|
||||
if !response.succeeded() {
|
||||
return Err(TowerError::IoError(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("Lost etcd instance lock for {}", node_pubkey),
|
||||
)));
|
||||
}
|
||||
|
||||
for op_response in response.op_responses() {
|
||||
if let etcd_client::TxnOpResponse::Get(get_response) = op_response {
|
||||
if let Some(kv) = get_response.kvs().get(0) {
|
||||
return bincode::deserialize_from(kv.value()).map_err(|e| e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Should never happen...
|
||||
Err(TowerError::IoError(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"Saved tower response missing".to_string(),
|
||||
)))
|
||||
}
|
||||
|
||||
fn store(&self, saved_tower: &SavedTower) -> Result<()> {
|
||||
let (instance_key, tower_key) = Self::get_keys(&saved_tower.node_pubkey);
|
||||
let mut client = self.client.write().unwrap();
|
||||
|
||||
let txn = etcd_client::Txn::new()
|
||||
.when(vec![etcd_client::Compare::value(
|
||||
instance_key,
|
||||
etcd_client::CompareOp::Equal,
|
||||
self.instance_id,
|
||||
)])
|
||||
.and_then(vec![etcd_client::TxnOp::put(
|
||||
tower_key,
|
||||
bincode::serialize(saved_tower)?,
|
||||
None,
|
||||
)]);
|
||||
|
||||
let response = self
|
||||
.runtime
|
||||
.block_on(async { client.txn(txn).await })
|
||||
.map_err(|err| {
|
||||
error!("Failed to write etcd saved tower: {}", err);
|
||||
err
|
||||
})
|
||||
.map_err(Self::etdc_to_tower_error)?;
|
||||
|
||||
if !response.succeeded() {
|
||||
return Err(TowerError::IoError(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("Lost etcd instance lock for {}", saved_tower.node_pubkey),
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -101,6 +101,7 @@ module.exports = {
|
||||
"running-validator/validator-stake",
|
||||
"running-validator/validator-monitor",
|
||||
"running-validator/validator-info",
|
||||
"running-validator/validator-failover",
|
||||
"running-validator/validator-troubleshoot",
|
||||
],
|
||||
Clusters: [
|
||||
|
146
docs/src/running-validator/validator-failover.md
Normal file
146
docs/src/running-validator/validator-failover.md
Normal file
@ -0,0 +1,146 @@
|
||||
---
|
||||
title: Failover Setup
|
||||
---
|
||||
|
||||
A simple two machine instance failover method is described here, which allows you to:
|
||||
* upgrade your validator software with virtually no down time, and
|
||||
* failover to the secondary instance when your monitoring detects a problem with
|
||||
the primary instance
|
||||
without any safety issues that would otherwise be associated with running two
|
||||
instances of your validator.
|
||||
|
||||
You will need two validator-class machines for your primary and secondary
|
||||
validator, and a third machine for running an [etcd](https://etcd.io/) cluster,
|
||||
which is used to store the tower voting record for your validator.
|
||||
|
||||
## Setup
|
||||
|
||||
### etcd cluster setup
|
||||
|
||||
There is ample documentation regarding etcd setup and configuration at
|
||||
https://etcd.io/, please generally familiarize yourself with etcd before
|
||||
continuing.
|
||||
|
||||
It's recommended that etcd be installed on a separate machine from your primary
|
||||
and secondary validator machines. This machine must be highly available, and
|
||||
depending on your needs you may wish to configure etcd with more than just
|
||||
one node.
|
||||
|
||||
First install `etcd` as desired for your machine. Then TLS certificates must be
|
||||
created for authentication between the etcd cluster and your validator. Here is
|
||||
one way to do this:
|
||||
|
||||
With [Golang](https://golang.org/) installed, run `go get
|
||||
github.com/cloudflare/cfssl/cmd/cfssl`. The `cfssl` program should now be
|
||||
available at `~/go/bin/cfssl`. Ensure `~/go/bin` is in your PATH by running
|
||||
`PATH=$PATH:~/go/bin/`.
|
||||
|
||||
Now create a certificate directory and configuration file:
|
||||
```
|
||||
mkdir -p certs/
|
||||
echo '{"CN":"etcd","hosts":["localhost", "127.0.0.1"],"key":{"algo":"rsa","size":2048}}' > certs/config.json
|
||||
```
|
||||
|
||||
then create certificates for the etcd server and the validator:
|
||||
```
|
||||
cfssl gencert -initca certs/config.json | cfssljson -bare certs/etcd-ca
|
||||
cfssl gencert -ca certs/etcd-ca.pem -ca-key certs/etcd-ca-key.pem certs/config.json | cfssljson -bare certs/validator
|
||||
cfssl gencert -ca certs/etcd-ca.pem -ca-key certs/etcd-ca-key.pem certs/config.json | cfssljson -bare certs/etcd
|
||||
```
|
||||
|
||||
Copy these files to your primary and secondary validator machines:
|
||||
* `certs/validator-key.pem`
|
||||
* `certs/validator.pem`
|
||||
* `certs/etcd-ca.pem`
|
||||
|
||||
and these files to the machine running the etcd server:
|
||||
* `certs/etcd.pem`
|
||||
* `certs/etcd-key.pem`
|
||||
* `certs/etcd-ca.pem`
|
||||
|
||||
With this configuration, both the validator and etcd will share the same
|
||||
TLS certificate authority and will each authenticate the other with it.
|
||||
|
||||
|
||||
Start `etcd` with the following arguments:
|
||||
```bash
|
||||
etcd --auto-compaction-retention 2 --auto-compaction-mode revision \
|
||||
--cert-file=certs/etcd.pem --key-file=certs/etcd-key.pem \
|
||||
--client-cert-auth \
|
||||
--trusted-ca-file=certs/etcd-ca.pem \
|
||||
--listen-client-urls=https://127.0.0.1:2379 \
|
||||
--advertise-client-urls=https://127.0.0.1:2379
|
||||
```
|
||||
|
||||
and use `curl` to confirm the etcd TLS certificates are properly configured:
|
||||
```bash
|
||||
curl --cacert certs/etcd-ca.pem https://127.0.0.1:2379/ --cert certs/validator.pem --key certs/validator-key.pem
|
||||
```
|
||||
On success, curl will return a 404 response.
|
||||
|
||||
For more information on etcd TLS setup, please refer to
|
||||
https://etcd.io/docs/v3.5/op-guide/security/#example-2-client-to-server-authentication-with-https-client-certificates
|
||||
|
||||
### Primary Validator
|
||||
The following additional `solana-validator` parameters are required to enable
|
||||
tower storage into etcd:
|
||||
|
||||
```
|
||||
solana-validator ... \
|
||||
--tower-storage etcd \
|
||||
--etcd-cacert-file certs/etcd-ca.pem \
|
||||
--etcd-cert-file certs/validator.pem \
|
||||
--etcd-key-file certs/validator-key.pem \
|
||||
--etcd-endpoint 127.0.0.1:2379 # <-- replace 127.0.0.1 with the actual IP address
|
||||
```
|
||||
|
||||
Note that once running your validator *will terminate* if it's not able to write
|
||||
its tower into etcd before submitting a vote transaction, so it's essential
|
||||
that your etcd endpoint remain accessible at all times.
|
||||
|
||||
### Secondary Validator
|
||||
Configure the secondary validator like the primary with the exception of the
|
||||
following `solana-validator` command-line argument changes:
|
||||
* Generate and use a secondary validator identity: `--identity secondary-validator-keypair.json`
|
||||
* Add `--no-check-vote-account`
|
||||
* Add `--authorized-voter validator-keypair.json` (where
|
||||
`validator-keypair.json` is the identity keypair for your primary validator)
|
||||
|
||||
## Triggering a failover manually
|
||||
When both validators are running normally and caught up to the cluster, a
|
||||
failover from primary to secondary can be triggered by running the following
|
||||
command on the secondary validator:
|
||||
```bash
|
||||
$ solana-validator wait-for-restart-window --identity validator-keypair.json \
|
||||
&& solana-validator set-identity validator-keypair.json
|
||||
```
|
||||
|
||||
The secondary validator will acquire a lock on the tower in etcd to ensure
|
||||
voting and block production safely switches over from the primary validator.
|
||||
|
||||
The primary validator will then terminate as soon as it detects the secondary
|
||||
validator using its identity.
|
||||
|
||||
Note: When the primary validator restarts (which may be immediate if you have
|
||||
configured your primary validator to do so) it will reclaim its identity
|
||||
from the secondary validator. This will in turn cause the secondary validator to
|
||||
exit. However if/when the secondary validator restarts, it will do so using the
|
||||
secondary validator identity and thus the restart cycle is broken.
|
||||
|
||||
## Triggering a failover via monitoring
|
||||
Monitoring of your choosing can invoke the `solana-validator set-identity
|
||||
validator-keypair.json` command mentioned in the previous section.
|
||||
|
||||
It is not necessary to guarantee the primary validator has halted before failing
|
||||
over to the secondary, as the failover process will prevent the primary
|
||||
validator from voting and producing blocks even if it is in an unknown state.
|
||||
|
||||
## Validator Software Upgrades
|
||||
To perform a software upgrade using this failover method:
|
||||
1. Install the new software version on your primary validator system but do not
|
||||
restart it yet.
|
||||
2. Trigger a manual failover to your secondary validator. This should cause your
|
||||
primary validator to terminate.
|
||||
3. When your primary validator restarts it will now be using the new software version.
|
||||
4. Once the primary validator catches up, upgrade the secondary validator at
|
||||
your convenience.
|
@ -21,7 +21,7 @@ use {
|
||||
},
|
||||
solana_core::{
|
||||
ledger_cleanup_service::{DEFAULT_MAX_LEDGER_SHREDS, DEFAULT_MIN_MAX_LEDGER_SHREDS},
|
||||
tower_storage::FileTowerStorage,
|
||||
tower_storage,
|
||||
tpu::DEFAULT_TPU_COALESCE_MS,
|
||||
validator::{
|
||||
is_snapshot_config_invalid, Validator, ValidatorConfig, ValidatorStartProgress,
|
||||
@ -1299,7 +1299,58 @@ pub fn main() {
|
||||
.long("tower")
|
||||
.value_name("DIR")
|
||||
.takes_value(true)
|
||||
.help("Use DIR as tower location [default: --ledger value]"),
|
||||
.help("Use DIR as file tower storage location [default: --ledger value]"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("tower_storage")
|
||||
.long("tower-storage")
|
||||
.possible_values(&["file", "etcd"])
|
||||
.default_value("file")
|
||||
.takes_value(true)
|
||||
.help("Where to store the tower"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("etcd_endpoint")
|
||||
.long("etcd-endpoint")
|
||||
.required_if("tower_storage", "etcd")
|
||||
.value_name("HOST:PORT")
|
||||
.takes_value(true)
|
||||
.multiple(true)
|
||||
.validator(solana_net_utils::is_host_port)
|
||||
.help("etcd gRPC endpoint to connect with")
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("etcd_domain_name")
|
||||
.long("etcd-domain-name")
|
||||
.required_if("tower_storage", "etcd")
|
||||
.value_name("DOMAIN")
|
||||
.default_value("localhost")
|
||||
.takes_value(true)
|
||||
.help("domain name against which to verify the etcd server’s TLS certificate")
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("etcd_cacert_file")
|
||||
.long("etcd-cacert-file")
|
||||
.required_if("tower_storage", "etcd")
|
||||
.value_name("FILE")
|
||||
.takes_value(true)
|
||||
.help("verify the TLS certificate of the etcd endpoint using this CA bundle")
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("etcd_key_file")
|
||||
.long("etcd-key-file")
|
||||
.required_if("tower_storage", "etcd")
|
||||
.value_name("FILE")
|
||||
.takes_value(true)
|
||||
.help("TLS key file to use when establishing a connection to the etcd endpoint")
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("etcd_cert_file")
|
||||
.long("etcd-cert-file")
|
||||
.required_if("tower_storage", "etcd")
|
||||
.value_name("FILE")
|
||||
.takes_value(true)
|
||||
.help("TLS certificate to use when establishing a connection to the etcd endpoint")
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gossip_port")
|
||||
@ -1316,7 +1367,6 @@ pub fn main() {
|
||||
.validator(solana_net_utils::is_host)
|
||||
.help("Gossip DNS name or IP address for the validator to advertise in gossip \
|
||||
[default: ask --entrypoint, or 127.0.0.1 when --entrypoint is not provided]"),
|
||||
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("public_rpc_addr")
|
||||
@ -2296,13 +2346,50 @@ pub fn main() {
|
||||
.ok()
|
||||
.or_else(|| get_cluster_shred_version(&entrypoint_addrs));
|
||||
|
||||
let tower_storage: Arc<dyn solana_core::tower_storage::TowerStorage> =
|
||||
match value_t_or_exit!(matches, "tower_storage", String).as_str() {
|
||||
"file" => {
|
||||
let tower_path = value_t!(matches, "tower", PathBuf)
|
||||
.ok()
|
||||
.unwrap_or_else(|| ledger_path.clone());
|
||||
|
||||
Arc::new(tower_storage::FileTowerStorage::new(tower_path))
|
||||
}
|
||||
"etcd" => {
|
||||
let endpoints = values_t_or_exit!(matches, "etcd_endpoint", String);
|
||||
let domain_name = value_t_or_exit!(matches, "etcd_domain_name", String);
|
||||
let ca_certificate_file = value_t_or_exit!(matches, "etcd_cacert_file", String);
|
||||
let identity_certificate_file = value_t_or_exit!(matches, "etcd_cert_file", String);
|
||||
let identity_private_key_file = value_t_or_exit!(matches, "etcd_key_file", String);
|
||||
|
||||
let read = |file| {
|
||||
fs::read(&file).unwrap_or_else(|err| {
|
||||
eprintln!("Unable to read {}: {}", file, err);
|
||||
exit(1)
|
||||
})
|
||||
};
|
||||
|
||||
let tls_config = tower_storage::EtcdTlsConfig {
|
||||
domain_name,
|
||||
ca_certificate: read(ca_certificate_file),
|
||||
identity_certificate: read(identity_certificate_file),
|
||||
identity_private_key: read(identity_private_key_file),
|
||||
};
|
||||
|
||||
Arc::new(
|
||||
tower_storage::EtcdTowerStorage::new(endpoints, Some(tls_config))
|
||||
.unwrap_or_else(|err| {
|
||||
eprintln!("Failed to connect to etcd: {}", err);
|
||||
exit(1);
|
||||
}),
|
||||
)
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let mut validator_config = ValidatorConfig {
|
||||
require_tower: matches.is_present("require_tower"),
|
||||
tower_storage: Arc::new(FileTowerStorage::new(tower_path)),
|
||||
tower_storage,
|
||||
dev_halt_at_slot: value_t!(matches, "dev_halt_at_slot", Slot).ok(),
|
||||
expected_genesis_hash: matches
|
||||
.value_of("expected_genesis_hash")
|
||||
|
Reference in New Issue
Block a user