* Tweak GCE scripts for higher node count - Some validators were unable to rsync config from leader when the node count was high (e.g. 25). Looks like the leader node was getting more rsync requests in parallel than it count handle. - This change staggers the validators bootup, and rsync time * Address review comments
360 lines
8.4 KiB
Bash
Executable File
360 lines
8.4 KiB
Bash
Executable File
#!/bin/bash -e
|
|
|
|
here=$(dirname "$0")
|
|
SOLANA_ROOT="$(cd "$here"/..; pwd)"
|
|
|
|
# shellcheck source=net/common.sh
|
|
source "$here"/common.sh
|
|
|
|
usage() {
|
|
exitcode=0
|
|
if [[ -n "$1" ]]; then
|
|
exitcode=1
|
|
echo "Error: $*"
|
|
fi
|
|
cat <<EOF
|
|
usage: $0 [start|stop|restart|sanity] [command-specific options]
|
|
|
|
Operate a configured testnet
|
|
|
|
start - Start the network
|
|
sanity - Sanity check the network
|
|
stop - Stop the network
|
|
restart - Shortcut for stop then start
|
|
|
|
start-specific options:
|
|
-S [snapFilename] - Deploy the specified Snap file
|
|
-s edge|beta|stable - Deploy the latest Snap on the specified Snap release channel
|
|
-f [cargoFeatures] - List of |cargo --feaures=| to activate
|
|
(ignored if -s or -S is specified)
|
|
|
|
Note: if RUST_LOG is set in the environment it will be propogated into the
|
|
network nodes.
|
|
|
|
sanity/start-specific options:
|
|
-o noLedgerVerify - Skip ledger verification
|
|
-o noValidatorSanity - Skip validator sanity
|
|
-o rejectExtraNodes - Require the exact number of nodes
|
|
|
|
stop-specific options:
|
|
none
|
|
|
|
EOF
|
|
exit $exitcode
|
|
}
|
|
|
|
snapChannel=
|
|
snapFilename=
|
|
deployMethod=local
|
|
sanityExtraArgs=
|
|
cargoFeatures=
|
|
|
|
command=$1
|
|
[[ -n $command ]] || usage
|
|
shift
|
|
|
|
while getopts "h?S:s:o:f:" opt; do
|
|
case $opt in
|
|
h | \?)
|
|
usage
|
|
;;
|
|
S)
|
|
snapFilename=$OPTARG
|
|
[[ -f $snapFilename ]] || usage "Snap not readable: $snapFilename"
|
|
deployMethod=snap
|
|
;;
|
|
s)
|
|
case $OPTARG in
|
|
edge|beta|stable)
|
|
snapChannel=$OPTARG
|
|
deployMethod=snap
|
|
;;
|
|
*)
|
|
usage "Invalid snap channel: $OPTARG"
|
|
;;
|
|
esac
|
|
;;
|
|
f)
|
|
cargoFeatures=$OPTARG
|
|
;;
|
|
o)
|
|
case $OPTARG in
|
|
noLedgerVerify|noValidatorSanity|rejectExtraNodes)
|
|
sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
|
|
;;
|
|
*)
|
|
echo "Error: unknown option: $OPTARG"
|
|
exit 1
|
|
;;
|
|
esac
|
|
;;
|
|
*)
|
|
usage "Error: unhandled option: $opt"
|
|
;;
|
|
esac
|
|
done
|
|
|
|
loadConfigFile
|
|
expectedNodeCount=$((${#validatorIpList[@]} + 1))
|
|
|
|
build() {
|
|
declare MAYBE_DOCKER=
|
|
if [[ $(uname) != Linux ]]; then
|
|
MAYBE_DOCKER="ci/docker-run.sh solanalabs/rust"
|
|
fi
|
|
SECONDS=0
|
|
(
|
|
cd "$SOLANA_ROOT"
|
|
echo "--- Build started at $(date)"
|
|
|
|
set -x
|
|
rm -rf farf
|
|
$MAYBE_DOCKER cargo install --features="$cargoFeatures" --root farf
|
|
)
|
|
echo "Build took $SECONDS seconds"
|
|
}
|
|
|
|
startCommon() {
|
|
declare ipAddress=$1
|
|
test -d "$SOLANA_ROOT"
|
|
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" \
|
|
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
|
|
"$ipAddress":~/solana/
|
|
}
|
|
|
|
startLeader() {
|
|
declare ipAddress=$1
|
|
declare logFile="$2"
|
|
echo "--- Starting leader: $leaderIp"
|
|
echo "start log: $logFile"
|
|
|
|
# Deploy local binaries to leader. Validators and clients later fetch the
|
|
# binaries from the leader.
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress" || exit 1
|
|
case $deployMethod in
|
|
snap)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap"
|
|
;;
|
|
local)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"./solana/net/remote/remote-node.sh $deployMethod leader $publicNetwork $entrypointIp $expectedNodeCount \"$RUST_LOG\""
|
|
) >> "$logFile" 2>&1 || {
|
|
cat "$logFile"
|
|
echo "^^^ +++"
|
|
exit 1
|
|
}
|
|
}
|
|
|
|
startValidator() {
|
|
declare ipAddress=$1
|
|
declare logFile="$netLogDir/validator-$ipAddress.log"
|
|
|
|
echo "--- Starting validator: $leaderIp"
|
|
echo "start log: $logFile"
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress"
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"./solana/net/remote/remote-node.sh $deployMethod validator $publicNetwork $entrypointIp $expectedNodeCount \"$RUST_LOG\""
|
|
) >> "$logFile" 2>&1 &
|
|
declare pid=$!
|
|
ln -sfT "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
|
|
pids+=("$pid")
|
|
}
|
|
|
|
startClient() {
|
|
declare ipAddress=$1
|
|
declare logFile="$2"
|
|
echo "--- Starting client: $ipAddress"
|
|
echo "start log: $logFile"
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress"
|
|
ssh "${sshOptions[@]}" -f "$ipAddress" \
|
|
"./solana/net/remote/remote-client.sh $deployMethod $entrypointIp $expectedNodeCount \"$RUST_LOG\""
|
|
) >> "$logFile" 2>&1 || {
|
|
cat "$logFile"
|
|
echo "^^^ +++"
|
|
exit 1
|
|
}
|
|
}
|
|
|
|
sanity() {
|
|
declare expectedNodeCount=$((${#validatorIpList[@]} + 1))
|
|
declare ok=true
|
|
|
|
echo "--- Sanity"
|
|
$metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"
|
|
|
|
(
|
|
set -x
|
|
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
|
|
ssh "${sshOptions[@]}" "$leaderIp" \
|
|
"./solana/net/remote/remote-sanity.sh $sanityExtraArgs"
|
|
) || ok=false
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
|
|
$ok || exit 1
|
|
}
|
|
|
|
start() {
|
|
case $deployMethod in
|
|
snap)
|
|
if [[ -n $snapChannel ]]; then
|
|
rm -f "$SOLANA_ROOT"/solana_*.snap
|
|
if [[ $(uname) != Linux ]]; then
|
|
(
|
|
set -x
|
|
SOLANA_DOCKER_RUN_NOSETUID=1 "$SOLANA_ROOT"/ci/docker-run.sh ubuntu:18.04 bash -c "
|
|
set -ex;
|
|
apt-get -qq update;
|
|
apt-get -qq -y install snapd;
|
|
snap download --channel=$snapChannel solana;
|
|
"
|
|
)
|
|
else
|
|
(
|
|
cd "$SOLANA_ROOT"
|
|
snap download --channel="$snapChannel" solana
|
|
)
|
|
fi
|
|
snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)"
|
|
[[ -r $snapFilename ]] || {
|
|
echo "Error: Snap not readable: $snapFilename"
|
|
exit 1
|
|
}
|
|
fi
|
|
;;
|
|
local)
|
|
build
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
|
|
echo "Deployment started at $(date)"
|
|
$metricsWriteDatapoint "testnet-deploy net-start-begin=1"
|
|
|
|
SECONDS=0
|
|
declare leaderDeployTime=
|
|
startLeader "$leaderIp" "$netLogDir/leader-$leaderIp.log"
|
|
leaderDeployTime=$SECONDS
|
|
$metricsWriteDatapoint "testnet-deploy net-leader-started=1"
|
|
|
|
SECONDS=0
|
|
pids=()
|
|
loopCount=0
|
|
for ipAddress in "${validatorIpList[@]}"; do
|
|
startValidator "$ipAddress"
|
|
|
|
# Staggering validator startup time. If too many validators
|
|
# bootup simultaneously, leader node gets more rsync requests
|
|
# from the validators than it can handle.
|
|
((loopCount++ % 2 == 0)) && sleep 2
|
|
done
|
|
|
|
for pid in "${pids[@]}"; do
|
|
declare ok=true
|
|
wait "$pid" || ok=false
|
|
if ! $ok; then
|
|
cat "$netLogDir/validator-$pid.log"
|
|
echo ^^^ +++
|
|
exit 1
|
|
fi
|
|
done
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-validators-started=1"
|
|
validatorDeployTime=$SECONDS
|
|
|
|
sanity
|
|
|
|
SECONDS=0
|
|
for ipAddress in "${clientIpList[@]}"; do
|
|
startClient "$ipAddress" "$netLogDir/client-$ipAddress.log"
|
|
done
|
|
clientDeployTime=$SECONDS
|
|
$metricsWriteDatapoint "testnet-deploy net-start-complete=1"
|
|
|
|
if [[ $deployMethod = "snap" ]]; then
|
|
declare networkVersion=unknown
|
|
IFS=\ read -r _ networkVersion _ < <(
|
|
ssh "${sshOptions[@]}" "$leaderIp" \
|
|
"snap info solana | grep \"^installed:\""
|
|
)
|
|
networkVersion=${networkVersion/0+git./}
|
|
$metricsWriteDatapoint "testnet-deploy version=\"$networkVersion\""
|
|
fi
|
|
|
|
echo
|
|
echo "+++ Deployment Successful"
|
|
echo "Leader deployment took $leaderDeployTime seconds"
|
|
echo "Validator deployment (${#validatorIpList[@]} instances) took $validatorDeployTime seconds"
|
|
echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
|
|
echo "Network start logs in $netLogDir:"
|
|
ls -l "$netLogDir"
|
|
}
|
|
|
|
|
|
stopNode() {
|
|
local ipAddress=$1
|
|
echo "--- Stopping node: $ipAddress"
|
|
(
|
|
set -x
|
|
ssh "${sshOptions[@]}" "$ipAddress" "
|
|
set -x
|
|
if snap list solana; then
|
|
sudo snap set solana mode=
|
|
sudo snap remove solana
|
|
fi
|
|
! tmux list-sessions || tmux kill-session
|
|
for pattern in solana- remote- oom-monitor net-stats; do
|
|
pkill -9 \$pattern
|
|
done
|
|
"
|
|
) || true
|
|
}
|
|
|
|
stop() {
|
|
SECONDS=0
|
|
$metricsWriteDatapoint "testnet-deploy net-stop-begin=1"
|
|
|
|
stopNode "$leaderIp"
|
|
|
|
for ipAddress in "${validatorIpList[@]}" "${clientIpList[@]}"; do
|
|
stopNode "$ipAddress"
|
|
done
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
|
|
echo "Stopping nodes took $SECONDS seconds"
|
|
}
|
|
|
|
case $command in
|
|
restart)
|
|
stop
|
|
start
|
|
;;
|
|
start)
|
|
start
|
|
;;
|
|
sanity)
|
|
sanity
|
|
;;
|
|
stop)
|
|
stop
|
|
;;
|
|
*)
|
|
echo "Internal error: Unknown command: $command"
|
|
exit 1
|
|
esac
|