If the network loses a validator or two, it's the job of the sanity check to detect this not the bench clients
392 lines
9.3 KiB
Bash
Executable File
392 lines
9.3 KiB
Bash
Executable File
#!/bin/bash -e
|
|
|
|
here=$(dirname "$0")
|
|
SOLANA_ROOT="$(cd "$here"/..; pwd)"
|
|
|
|
# shellcheck source=net/common.sh
|
|
source "$here"/common.sh
|
|
|
|
usage() {
|
|
exitcode=0
|
|
if [[ -n "$1" ]]; then
|
|
exitcode=1
|
|
echo "Error: $*"
|
|
fi
|
|
cat <<EOF
|
|
usage: $0 [start|stop|restart|sanity] [command-specific options]
|
|
|
|
Operate a configured testnet
|
|
|
|
start - Start the network
|
|
sanity - Sanity check the network
|
|
stop - Stop the network
|
|
restart - Shortcut for stop then start
|
|
|
|
start-specific options:
|
|
-S [snapFilename] - Deploy the specified Snap file
|
|
-s edge|beta|stable - Deploy the latest Snap on the specified Snap release channel
|
|
-t edge|beta|stable - Deploy the latest tarball release for the specified channel
|
|
-f [cargoFeatures] - List of |cargo --feaures=| to activate
|
|
(ignored if -s or -S is specified)
|
|
|
|
Note: if RUST_LOG is set in the environment it will be propogated into the
|
|
network nodes.
|
|
|
|
sanity/start-specific options:
|
|
-o noLedgerVerify - Skip ledger verification
|
|
-o noValidatorSanity - Skip validator sanity
|
|
-o rejectExtraNodes - Require the exact number of nodes
|
|
|
|
stop-specific options:
|
|
none
|
|
|
|
EOF
|
|
exit $exitcode
|
|
}
|
|
|
|
snapChannel=
|
|
releaseChannel=
|
|
snapFilename=
|
|
deployMethod=local
|
|
sanityExtraArgs=
|
|
cargoFeatures=
|
|
|
|
command=$1
|
|
[[ -n $command ]] || usage
|
|
shift
|
|
|
|
while getopts "h?S:s:t:o:f:" opt; do
|
|
case $opt in
|
|
h | \?)
|
|
usage
|
|
;;
|
|
S)
|
|
snapFilename=$OPTARG
|
|
[[ -f $snapFilename ]] || usage "Snap not readable: $snapFilename"
|
|
deployMethod=snap
|
|
;;
|
|
s)
|
|
case $OPTARG in
|
|
edge|beta|stable)
|
|
snapChannel=$OPTARG
|
|
deployMethod=snap
|
|
;;
|
|
*)
|
|
usage "Invalid snap channel: $OPTARG"
|
|
;;
|
|
esac
|
|
;;
|
|
t)
|
|
case $OPTARG in
|
|
edge|beta|stable)
|
|
releaseChannel=$OPTARG
|
|
deployMethod=tar
|
|
;;
|
|
*)
|
|
usage "Invalid release channel: $OPTARG"
|
|
;;
|
|
esac
|
|
;;
|
|
f)
|
|
cargoFeatures=$OPTARG
|
|
;;
|
|
o)
|
|
case $OPTARG in
|
|
noLedgerVerify|noValidatorSanity|rejectExtraNodes)
|
|
sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
|
|
;;
|
|
*)
|
|
echo "Error: unknown option: $OPTARG"
|
|
exit 1
|
|
;;
|
|
esac
|
|
;;
|
|
*)
|
|
usage "Error: unhandled option: $opt"
|
|
;;
|
|
esac
|
|
done
|
|
|
|
loadConfigFile
|
|
expectedNodeCount=$((${#validatorIpList[@]} + 1))
|
|
|
|
build() {
|
|
declare MAYBE_DOCKER=
|
|
if [[ $(uname) != Linux ]]; then
|
|
MAYBE_DOCKER="ci/docker-run.sh solanalabs/rust"
|
|
fi
|
|
SECONDS=0
|
|
(
|
|
cd "$SOLANA_ROOT"
|
|
echo "--- Build started at $(date)"
|
|
|
|
set -x
|
|
rm -rf farf
|
|
$MAYBE_DOCKER cargo install --features="$cargoFeatures" --root farf
|
|
./scripts/install-native-programs.sh farf/
|
|
)
|
|
echo "Build took $SECONDS seconds"
|
|
}
|
|
|
|
startCommon() {
|
|
declare ipAddress=$1
|
|
test -d "$SOLANA_ROOT"
|
|
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" \
|
|
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
|
|
"$ipAddress":~/solana/
|
|
}
|
|
|
|
startLeader() {
|
|
declare ipAddress=$1
|
|
declare logFile="$2"
|
|
echo "--- Starting leader: $leaderIp"
|
|
echo "start log: $logFile"
|
|
|
|
# Deploy local binaries to leader. Validators and clients later fetch the
|
|
# binaries from the leader.
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress" || exit 1
|
|
case $deployMethod in
|
|
snap)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap"
|
|
;;
|
|
tar)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/bin/* "$ipAddress:~/.cargo/bin/"
|
|
;;
|
|
local)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"./solana/net/remote/remote-node.sh $deployMethod leader $publicNetwork $entrypointIp $expectedNodeCount \"$RUST_LOG\""
|
|
) >> "$logFile" 2>&1 || {
|
|
cat "$logFile"
|
|
echo "^^^ +++"
|
|
exit 1
|
|
}
|
|
}
|
|
|
|
startValidator() {
|
|
declare ipAddress=$1
|
|
declare logFile="$netLogDir/validator-$ipAddress.log"
|
|
|
|
echo "--- Starting validator: $ipAddress"
|
|
echo "start log: $logFile"
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress"
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"./solana/net/remote/remote-node.sh $deployMethod validator $publicNetwork $entrypointIp $expectedNodeCount \"$RUST_LOG\""
|
|
) >> "$logFile" 2>&1 &
|
|
declare pid=$!
|
|
ln -sfT "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
|
|
pids+=("$pid")
|
|
}
|
|
|
|
startClient() {
|
|
declare ipAddress=$1
|
|
declare logFile="$2"
|
|
echo "--- Starting client: $ipAddress"
|
|
echo "start log: $logFile"
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress"
|
|
ssh "${sshOptions[@]}" -f "$ipAddress" \
|
|
"./solana/net/remote/remote-client.sh $deployMethod $entrypointIp \"$RUST_LOG\""
|
|
) >> "$logFile" 2>&1 || {
|
|
cat "$logFile"
|
|
echo "^^^ +++"
|
|
exit 1
|
|
}
|
|
}
|
|
|
|
sanity() {
|
|
declare expectedNodeCount=$((${#validatorIpList[@]} + 1))
|
|
declare ok=true
|
|
|
|
echo "--- Sanity"
|
|
$metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"
|
|
|
|
declare host=$leaderIp # TODO: maybe use ${validatorIpList[0]} ?
|
|
(
|
|
set -x
|
|
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
|
|
ssh "${sshOptions[@]}" "$host" \
|
|
"./solana/net/remote/remote-sanity.sh $sanityExtraArgs"
|
|
) || ok=false
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
|
|
$ok || exit 1
|
|
}
|
|
|
|
start() {
|
|
case $deployMethod in
|
|
snap)
|
|
if [[ -n $snapChannel ]]; then
|
|
rm -f "$SOLANA_ROOT"/solana_*.snap
|
|
if [[ $(uname) != Linux ]]; then
|
|
(
|
|
set -x
|
|
SOLANA_DOCKER_RUN_NOSETUID=1 "$SOLANA_ROOT"/ci/docker-run.sh ubuntu:18.04 bash -c "
|
|
set -ex;
|
|
apt-get -qq update;
|
|
apt-get -qq -y install snapd;
|
|
until snap download --channel=$snapChannel solana; do
|
|
sleep 1;
|
|
done
|
|
"
|
|
)
|
|
else
|
|
(
|
|
cd "$SOLANA_ROOT"
|
|
until snap download --channel="$snapChannel" solana; do
|
|
sleep 1
|
|
done
|
|
)
|
|
fi
|
|
snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)"
|
|
[[ -r $snapFilename ]] || {
|
|
echo "Error: Snap not readable: $snapFilename"
|
|
exit 1
|
|
}
|
|
fi
|
|
;;
|
|
tar)
|
|
if [[ -n $releaseChannel ]]; then
|
|
rm -f "$SOLANA_ROOT"/solana-release.tar.bz2
|
|
cd "$SOLANA_ROOT"
|
|
|
|
set -x
|
|
curl -o solana-release.tar.bz2 http://solana-release.s3.amazonaws.com/"$releaseChannel"/solana-release.tar.bz2
|
|
tar jxvf solana-release.tar.bz2
|
|
fi
|
|
;;
|
|
local)
|
|
build
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
|
|
echo "Deployment started at $(date)"
|
|
$metricsWriteDatapoint "testnet-deploy net-start-begin=1"
|
|
|
|
SECONDS=0
|
|
declare leaderDeployTime=
|
|
startLeader "$leaderIp" "$netLogDir/leader-$leaderIp.log"
|
|
leaderDeployTime=$SECONDS
|
|
$metricsWriteDatapoint "testnet-deploy net-leader-started=1"
|
|
|
|
SECONDS=0
|
|
pids=()
|
|
loopCount=0
|
|
for ipAddress in "${validatorIpList[@]}"; do
|
|
startValidator "$ipAddress"
|
|
|
|
# Staggering validator startup time. If too many validators
|
|
# bootup simultaneously, leader node gets more rsync requests
|
|
# from the validators than it can handle.
|
|
((loopCount++ % 2 == 0)) && sleep 2
|
|
done
|
|
|
|
for pid in "${pids[@]}"; do
|
|
declare ok=true
|
|
wait "$pid" || ok=false
|
|
if ! $ok; then
|
|
cat "$netLogDir/validator-$pid.log"
|
|
echo ^^^ +++
|
|
exit 1
|
|
fi
|
|
done
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-validators-started=1"
|
|
validatorDeployTime=$SECONDS
|
|
|
|
sanity
|
|
|
|
SECONDS=0
|
|
for ipAddress in "${clientIpList[@]}"; do
|
|
startClient "$ipAddress" "$netLogDir/client-$ipAddress.log"
|
|
done
|
|
clientDeployTime=$SECONDS
|
|
$metricsWriteDatapoint "testnet-deploy net-start-complete=1"
|
|
|
|
if [[ $deployMethod = "snap" ]]; then
|
|
declare networkVersion=unknown
|
|
IFS=\ read -r _ networkVersion _ < <(
|
|
ssh "${sshOptions[@]}" "$leaderIp" \
|
|
"snap info solana | grep \"^installed:\""
|
|
)
|
|
networkVersion=${networkVersion/0+git./}
|
|
$metricsWriteDatapoint "testnet-deploy version=\"$networkVersion\""
|
|
fi
|
|
|
|
echo
|
|
echo "+++ Deployment Successful"
|
|
echo "Leader deployment took $leaderDeployTime seconds"
|
|
echo "Validator deployment (${#validatorIpList[@]} instances) took $validatorDeployTime seconds"
|
|
echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
|
|
echo "Network start logs in $netLogDir:"
|
|
ls -l "$netLogDir"
|
|
}
|
|
|
|
|
|
stopNode() {
|
|
local ipAddress=$1
|
|
echo "--- Stopping node: $ipAddress"
|
|
(
|
|
set -x
|
|
ssh "${sshOptions[@]}" "$ipAddress" "
|
|
set -x
|
|
if snap list solana; then
|
|
sudo snap set solana mode=
|
|
sudo snap remove solana
|
|
fi
|
|
! tmux list-sessions || tmux kill-session
|
|
for pattern in solana- remote- oom-monitor net-stats; do
|
|
pkill -9 \$pattern
|
|
done
|
|
"
|
|
) || true
|
|
}
|
|
|
|
stop() {
|
|
SECONDS=0
|
|
$metricsWriteDatapoint "testnet-deploy net-stop-begin=1"
|
|
|
|
stopNode "$leaderIp"
|
|
|
|
for ipAddress in "${validatorIpList[@]}" "${clientIpList[@]}"; do
|
|
stopNode "$ipAddress"
|
|
done
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
|
|
echo "Stopping nodes took $SECONDS seconds"
|
|
}
|
|
|
|
case $command in
|
|
restart)
|
|
stop
|
|
start
|
|
;;
|
|
start)
|
|
start
|
|
;;
|
|
sanity)
|
|
sanity
|
|
;;
|
|
stop)
|
|
stop
|
|
;;
|
|
*)
|
|
echo "Internal error: Unknown command: $command"
|
|
exit 1
|
|
esac
|