Files
solana/net/net.sh
Pankaj Garg be7cce1fd2 Tweak GCE scripts for higher node count (#1229)
* Tweak GCE scripts for higher node count

- Some validators were unable to rsync config from leader when
  the node count was high (e.g. 25). Looks like the leader node was
  getting more rsync requests in parallel than it count handle.
- This change staggers the validators bootup, and rsync time

* Address review comments
2018-09-14 17:17:08 -07:00

360 lines
8.4 KiB
Bash
Executable File

#!/bin/bash -e
here=$(dirname "$0")
SOLANA_ROOT="$(cd "$here"/..; pwd)"
# shellcheck source=net/common.sh
source "$here"/common.sh
usage() {
exitcode=0
if [[ -n "$1" ]]; then
exitcode=1
echo "Error: $*"
fi
cat <<EOF
usage: $0 [start|stop|restart|sanity] [command-specific options]
Operate a configured testnet
start - Start the network
sanity - Sanity check the network
stop - Stop the network
restart - Shortcut for stop then start
start-specific options:
-S [snapFilename] - Deploy the specified Snap file
-s edge|beta|stable - Deploy the latest Snap on the specified Snap release channel
-f [cargoFeatures] - List of |cargo --feaures=| to activate
(ignored if -s or -S is specified)
Note: if RUST_LOG is set in the environment it will be propogated into the
network nodes.
sanity/start-specific options:
-o noLedgerVerify - Skip ledger verification
-o noValidatorSanity - Skip validator sanity
-o rejectExtraNodes - Require the exact number of nodes
stop-specific options:
none
EOF
exit $exitcode
}
snapChannel=
snapFilename=
deployMethod=local
sanityExtraArgs=
cargoFeatures=
command=$1
[[ -n $command ]] || usage
shift
while getopts "h?S:s:o:f:" opt; do
case $opt in
h | \?)
usage
;;
S)
snapFilename=$OPTARG
[[ -f $snapFilename ]] || usage "Snap not readable: $snapFilename"
deployMethod=snap
;;
s)
case $OPTARG in
edge|beta|stable)
snapChannel=$OPTARG
deployMethod=snap
;;
*)
usage "Invalid snap channel: $OPTARG"
;;
esac
;;
f)
cargoFeatures=$OPTARG
;;
o)
case $OPTARG in
noLedgerVerify|noValidatorSanity|rejectExtraNodes)
sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
;;
*)
echo "Error: unknown option: $OPTARG"
exit 1
;;
esac
;;
*)
usage "Error: unhandled option: $opt"
;;
esac
done
loadConfigFile
expectedNodeCount=$((${#validatorIpList[@]} + 1))
build() {
declare MAYBE_DOCKER=
if [[ $(uname) != Linux ]]; then
MAYBE_DOCKER="ci/docker-run.sh solanalabs/rust"
fi
SECONDS=0
(
cd "$SOLANA_ROOT"
echo "--- Build started at $(date)"
set -x
rm -rf farf
$MAYBE_DOCKER cargo install --features="$cargoFeatures" --root farf
)
echo "Build took $SECONDS seconds"
}
startCommon() {
declare ipAddress=$1
test -d "$SOLANA_ROOT"
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
rsync -vPrc -e "ssh ${sshOptions[*]}" \
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
"$ipAddress":~/solana/
}
startLeader() {
declare ipAddress=$1
declare logFile="$2"
echo "--- Starting leader: $leaderIp"
echo "start log: $logFile"
# Deploy local binaries to leader. Validators and clients later fetch the
# binaries from the leader.
(
set -x
startCommon "$ipAddress" || exit 1
case $deployMethod in
snap)
rsync -vPrc -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap"
;;
local)
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
;;
*)
usage "Internal error: invalid deployMethod: $deployMethod"
;;
esac
ssh "${sshOptions[@]}" -n "$ipAddress" \
"./solana/net/remote/remote-node.sh $deployMethod leader $publicNetwork $entrypointIp $expectedNodeCount \"$RUST_LOG\""
) >> "$logFile" 2>&1 || {
cat "$logFile"
echo "^^^ +++"
exit 1
}
}
startValidator() {
declare ipAddress=$1
declare logFile="$netLogDir/validator-$ipAddress.log"
echo "--- Starting validator: $leaderIp"
echo "start log: $logFile"
(
set -x
startCommon "$ipAddress"
ssh "${sshOptions[@]}" -n "$ipAddress" \
"./solana/net/remote/remote-node.sh $deployMethod validator $publicNetwork $entrypointIp $expectedNodeCount \"$RUST_LOG\""
) >> "$logFile" 2>&1 &
declare pid=$!
ln -sfT "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
pids+=("$pid")
}
startClient() {
declare ipAddress=$1
declare logFile="$2"
echo "--- Starting client: $ipAddress"
echo "start log: $logFile"
(
set -x
startCommon "$ipAddress"
ssh "${sshOptions[@]}" -f "$ipAddress" \
"./solana/net/remote/remote-client.sh $deployMethod $entrypointIp $expectedNodeCount \"$RUST_LOG\""
) >> "$logFile" 2>&1 || {
cat "$logFile"
echo "^^^ +++"
exit 1
}
}
sanity() {
declare expectedNodeCount=$((${#validatorIpList[@]} + 1))
declare ok=true
echo "--- Sanity"
$metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"
(
set -x
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
ssh "${sshOptions[@]}" "$leaderIp" \
"./solana/net/remote/remote-sanity.sh $sanityExtraArgs"
) || ok=false
$metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
$ok || exit 1
}
start() {
case $deployMethod in
snap)
if [[ -n $snapChannel ]]; then
rm -f "$SOLANA_ROOT"/solana_*.snap
if [[ $(uname) != Linux ]]; then
(
set -x
SOLANA_DOCKER_RUN_NOSETUID=1 "$SOLANA_ROOT"/ci/docker-run.sh ubuntu:18.04 bash -c "
set -ex;
apt-get -qq update;
apt-get -qq -y install snapd;
snap download --channel=$snapChannel solana;
"
)
else
(
cd "$SOLANA_ROOT"
snap download --channel="$snapChannel" solana
)
fi
snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)"
[[ -r $snapFilename ]] || {
echo "Error: Snap not readable: $snapFilename"
exit 1
}
fi
;;
local)
build
;;
*)
usage "Internal error: invalid deployMethod: $deployMethod"
;;
esac
echo "Deployment started at $(date)"
$metricsWriteDatapoint "testnet-deploy net-start-begin=1"
SECONDS=0
declare leaderDeployTime=
startLeader "$leaderIp" "$netLogDir/leader-$leaderIp.log"
leaderDeployTime=$SECONDS
$metricsWriteDatapoint "testnet-deploy net-leader-started=1"
SECONDS=0
pids=()
loopCount=0
for ipAddress in "${validatorIpList[@]}"; do
startValidator "$ipAddress"
# Staggering validator startup time. If too many validators
# bootup simultaneously, leader node gets more rsync requests
# from the validators than it can handle.
((loopCount++ % 2 == 0)) && sleep 2
done
for pid in "${pids[@]}"; do
declare ok=true
wait "$pid" || ok=false
if ! $ok; then
cat "$netLogDir/validator-$pid.log"
echo ^^^ +++
exit 1
fi
done
$metricsWriteDatapoint "testnet-deploy net-validators-started=1"
validatorDeployTime=$SECONDS
sanity
SECONDS=0
for ipAddress in "${clientIpList[@]}"; do
startClient "$ipAddress" "$netLogDir/client-$ipAddress.log"
done
clientDeployTime=$SECONDS
$metricsWriteDatapoint "testnet-deploy net-start-complete=1"
if [[ $deployMethod = "snap" ]]; then
declare networkVersion=unknown
IFS=\ read -r _ networkVersion _ < <(
ssh "${sshOptions[@]}" "$leaderIp" \
"snap info solana | grep \"^installed:\""
)
networkVersion=${networkVersion/0+git./}
$metricsWriteDatapoint "testnet-deploy version=\"$networkVersion\""
fi
echo
echo "+++ Deployment Successful"
echo "Leader deployment took $leaderDeployTime seconds"
echo "Validator deployment (${#validatorIpList[@]} instances) took $validatorDeployTime seconds"
echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
echo "Network start logs in $netLogDir:"
ls -l "$netLogDir"
}
stopNode() {
local ipAddress=$1
echo "--- Stopping node: $ipAddress"
(
set -x
ssh "${sshOptions[@]}" "$ipAddress" "
set -x
if snap list solana; then
sudo snap set solana mode=
sudo snap remove solana
fi
! tmux list-sessions || tmux kill-session
for pattern in solana- remote- oom-monitor net-stats; do
pkill -9 \$pattern
done
"
) || true
}
stop() {
SECONDS=0
$metricsWriteDatapoint "testnet-deploy net-stop-begin=1"
stopNode "$leaderIp"
for ipAddress in "${validatorIpList[@]}" "${clientIpList[@]}"; do
stopNode "$ipAddress"
done
$metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
echo "Stopping nodes took $SECONDS seconds"
}
case $command in
restart)
stop
start
;;
start)
start
;;
sanity)
sanity
;;
stop)
stop
;;
*)
echo "Internal error: Unknown command: $command"
exit 1
esac