solana/net/net.sh
Michael Vines 447fe48d2a
Revert "Add a stand-alone gossip node on the blocksteamer instance"
This reverts commit a21792056130457d3d9ceb75168973677ad6a050.

This commit is causing trouble when the TdS cluster is reset and
validators running an older genesis config are still present.
Occasionally an RPC URL from an older validator will be selected,
causing a new node to fail to boot.
2020-01-04 16:42:12 -07:00

1046 lines
31 KiB
Bash
Executable File

#!/usr/bin/env bash
# Command-line driver for operating a configured testnet; usage() below lists
# the supported commands and options.
# Abort on the first command that fails outside of a conditional context.
set -e
# Directory containing this script; used to locate sibling helper scripts.
here=$(dirname "$0")
# Absolute path of the parent of $here.
SOLANA_ROOT="$(cd "$here"/..; pwd)"
# shellcheck source=net/common.sh
source "$here"/common.sh
# Print the help text and terminate the script.
#
# Arguments: $* - optional error message. When present it is printed as
#                 "Error: <message>" ahead of the help text and the exit
#                 status is 1; otherwise the exit status is 0.
# Globals:   clientDelayStart, gpuMode (read; shown as the documented defaults)
# Returns:   never; always calls exit.
usage() {
  local exitcode=0
  if [[ -n "$1" ]]; then
    exitcode=1
    echo "Error: $*"
  fi
  # The here-document body is kept flush left so the emitted text is unchanged.
  cat <<EOF
usage: $0 [start|stop|restart|sanity] [command-specific options]
Operate a configured testnet
start - Start the network
sanity - Sanity check the network
stop - Stop the network
restart - Shortcut for stop then start
logs - Fetch remote logs from each network node
startnode- Start an individual node (previously stopped with stopNode)
stopnode - Stop an individual node
update - Deploy a new software update to the cluster
start-specific options:
-T [tarFilename] - Deploy the specified release tarball
-t edge|beta|stable|vX.Y.Z - Deploy the latest tarball release for the
specified release channel (edge|beta|stable) or release tag
(vX.Y.Z)
-r / --skip-setup - Reuse existing node/ledger configuration from a
previous |start| (ie, don't run ./multinode-demo/setup.sh).
-d / --debug - Build/deploy the testnet with debug binaries
-c clientType=numClients=extraArgs - Number of clientTypes to start. This option can be specified
more than once. Defaults to bench-tps for all clients if not
specified.
Valid client types are:
idle
bench-tps
bench-exchange
User can optionally provide extraArgs that are transparently
supplied to the client program as command line parameters.
For example,
-c bench-tps=2="--tx_count 25000"
This will start 2 bench-tps clients, and supply "--tx_count 25000"
to the bench-tps client.
--client-delay-start
- Number of seconds to wait after validators have finished starting before starting client programs
(default: $clientDelayStart)
-n NUM_VALIDATORS - Number of validators to apply command to.
--gpu-mode GPU_MODE - Specify GPU mode to launch validators with (default: $gpuMode).
MODE must be one of
on - GPU *required*, any vendor *
off - No GPU, CPU-only
auto - Use GPU if available, any vendor *
cuda - GPU *required*, Nvidia CUDA only
* Currently, Nvidia CUDA is the only supported GPU vendor
--hashes-per-tick NUM_HASHES|sleep|auto
- Override the default --hashes-per-tick for the cluster
--no-airdrop
- If set, disables airdrops. Nodes must be funded in genesis config when airdrops are disabled.
--faucet-lamports NUM_LAMPORTS_TO_MINT
- Override the default 500000000000000000 lamports minted in genesis
--internal-nodes-stake-lamports NUM_LAMPORTS_PER_NODE
- Amount to stake internal nodes.
--internal-nodes-lamports NUM_LAMPORTS_PER_NODE
- Amount to fund internal nodes in genesis config.
--external-accounts-file FILE_PATH
- A YML file with a list of account pubkeys and corresponding lamport balances
in genesis config for external nodes
--no-snapshot-fetch
- If set, disables booting validators from a snapshot
--skip-poh-verify
- If set, validators will skip verifying
the ledger they already have saved to disk at
boot (results in a much faster boot)
--no-deploy
- Don't deploy new software, use the
existing deployment
--no-build
- Don't build new software, deploy the
existing binaries
--deploy-if-newer - Only deploy if newer software is
available (requires -t or -T)
--use-move - Build the move-loader-program and add it to the cluster
--operating-mode development|softlaunch
- Specify whether or not to launch the cluster in "development" mode with all features enabled at epoch 0,
or "softlaunch" mode with some features disabled at epoch 0 (default: development)
sanity/start-specific options:
-F - Discard validator nodes that didn't bootup successfully
-o noInstallCheck - Skip solana-install sanity
-o rejectExtraNodes - Require the exact number of nodes
stop-specific options:
none
logs-specific options:
none
netem-specific options:
--config - Netem configuration (as a double quoted string)
--partition - Percentage of network that should be configured with netem
--config-file - Configuration file for partition and netem configuration
--netem-cmd - Optional command argument to netem. Default is "add". Use "cleanup" to remove rules.
update-specific options:
--platform linux|osx|windows - Deploy the tarball using 'solana-install deploy ...' for the
given platform (multiple platforms may be specified)
(-t option must be supplied as well)
startnode/stopnode-specific options:
-i [ip address] - IP Address of the node to start or stop
Note: if RUST_LOG is set in the environment it will be propagated into the
network nodes.
EOF
  exit $exitcode
}
# Option defaults. The long-option loop and the getopts loop that follow
# overwrite these from the command line.
# Deployment selection: deployMethod becomes "tar" for -T/-t and "skip" for
# --no-deploy; releaseChannel is the -t argument.
releaseChannel=
deployMethod=local
deployIfNewer=
# Accumulates the "-o <check>" flags that are forwarded to remote-sanity.sh.
sanityExtraArgs=
skipSetup=false
# Space-separated list built from repeated --platform options.
updatePlatforms=
# Target of startnode/stopnode, set with -i.
nodeAddress=
# Client counts and extra arguments, filled in from -c tuples.
numIdleClients=0
numBenchTpsClients=0
numBenchExchangeClients=0
benchTpsExtraArgs=
benchExchangeExtraArgs=
# -F switches this to false so a validator that fails to boot is non-fatal.
failOnValidatorBootupFailure=true
# Accumulates genesis-related flags together with their values.
genesisOptions=
# -n argument; when set, only that many validators are operated on.
numValidatorsRequested=
externalPrimordialAccountsFile=
remoteExternalPrimordialAccountsFile=
internalNodesStakeLamports=
internalNodesLamports=
# The maybe* variables hold the literal flag text when the matching option was
# supplied and stay empty otherwise.
maybeNoSnapshot=""
maybeLimitLedgerSize=""
maybeSkipLedgerVerify=""
maybeDisableAirdrops=""
debugBuild=false
doBuild=true
gpuMode=auto
maybeUseMove=""
# Settings consumed by the netem command.
netemPartition=""
netemConfig=""
netemConfigFile=""
netemCommand="add"
# Seconds to sleep between the validators being up and the clients starting.
clientDelayStart=0
# The first positional argument names the command; the rest are options.
command=$1
if [[ -z $command ]]; then
  usage
fi
shift

# Consume every "--long" option here. Anything else is queued in shortArgs, in
# order, for the getopts pass that follows. Parsing stops at the first empty
# argument.
shortArgs=()
while [[ -n $1 ]]; do
  case $1 in
  --hashes-per-tick | --slots-per-epoch | --target-lamports-per-signature | --faucet-lamports)
    # Passed through as "<flag> <value>" in the genesis options
    genesisOptions="$genesisOptions $1 $2"
    shift 2
    ;;
  --operating-mode)
    case "$2" in
    development|softlaunch)
      ;;
    *)
      echo "Unexpected operating mode: \"$2\""
      exit 1
      ;;
    esac
    genesisOptions="$genesisOptions $1 $2"
    shift 2
    ;;
  --no-snapshot-fetch)
    maybeNoSnapshot="$1"
    shift
    ;;
  --deploy-if-newer)
    deployIfNewer=1
    shift
    ;;
  --no-deploy)
    deployMethod=skip
    shift
    ;;
  --no-build)
    doBuild=false
    shift
    ;;
  --limit-ledger-size)
    maybeLimitLedgerSize="$1"
    shift
    ;;
  --skip-poh-verify)
    maybeSkipLedgerVerify="$1"
    shift
    ;;
  --skip-setup)
    skipSetup=true
    shift
    ;;
  --platform)
    updatePlatforms="$updatePlatforms $2"
    shift 2
    ;;
  --internal-nodes-stake-lamports)
    internalNodesStakeLamports="$2"
    shift 2
    ;;
  --internal-nodes-lamports)
    internalNodesLamports="$2"
    shift 2
    ;;
  --external-accounts-file)
    externalPrimordialAccountsFile="$2"
    remoteExternalPrimordialAccountsFile=/tmp/external-primordial-accounts.yml
    shift 2
    ;;
  --no-airdrop)
    maybeDisableAirdrops="$1"
    shift
    ;;
  --debug)
    debugBuild=true
    shift
    ;;
  --use-move)
    maybeUseMove=$1
    shift
    ;;
  --partition)
    netemPartition=$2
    shift 2
    ;;
  --config)
    netemConfig=$2
    shift 2
    ;;
  --config-file)
    netemConfigFile=$2
    shift 2
    ;;
  --netem-cmd)
    netemCommand=$2
    shift 2
    ;;
  --gpu-mode)
    gpuMode=$2
    case "$gpuMode" in
    on|off|auto|cuda)
      ;;
    *)
      echo "Unexpected GPU mode: \"$gpuMode\""
      exit 1
      ;;
    esac
    shift 2
    ;;
  --client-delay-start)
    clientDelayStart=$2
    shift 2
    ;;
  --*)
    usage "Unknown long option: $1"
    ;;
  *)
    shortArgs+=("$1")
    shift
    ;;
  esac
done
# Parse one "-c clientType=numClients[=extraArgs]" tuple.
#
# Globals:   OPTARG (read) - the tuple text
#            numIdleClients, numBenchTpsClients, benchTpsExtraArgs,
#            numBenchExchangeClients, benchExchangeExtraArgs (written)
# Exits with status 1 when the tuple has no '=', when numClients is not a
# non-negative integer, or when clientType is not recognised.
getClientTypeAndNum() {
  if [[ $OPTARG != *'='* ]]; then
    echo "Error: Expecting tuple \"clientType=numClientType=extraArgs\" but got \"$OPTARG\""
    exit 1
  fi

  # Split on the first two '=' only, so extraArgs may itself contain '='
  # (for example "--tx_count=25000").
  local clientType=${OPTARG%%=*}
  local remainder=${OPTARG#*=}
  local numClients=${remainder%%=*}
  local extraArgs=
  if [[ $remainder == *'='* ]]; then
    extraArgs=${remainder#*=}
  fi

  local re='^[0-9]+$'
  if ! [[ $numClients =~ $re ]]; then
    echo "error: numClientType must be a number but got \"$numClients\""
    exit 1
  fi

  case $clientType in
  idle)
    numIdleClients=$numClients
    # $extraArgs ignored for 'idle'
    ;;
  bench-tps)
    numBenchTpsClients=$numClients
    benchTpsExtraArgs=$extraArgs
    ;;
  bench-exchange)
    numBenchExchangeClients=$numClients
    benchExchangeExtraArgs=$extraArgs
    ;;
  *)
    echo "Unknown client type: $clientType"
    exit 1
    ;;
  esac
}

# Second parsing pass: the short options that the long-option loop queued in
# shortArgs.
while getopts "h?T:t:o:f:rc:Fn:i:d" opt "${shortArgs[@]}"; do
  case $opt in
  h | \?)
    usage
    ;;
  T)
    tarballFilename=$OPTARG
    [[ -r $tarballFilename ]] || usage "File not readable: $tarballFilename"
    deployMethod=tar
    ;;
  t)
    case $OPTARG in
    edge|beta|stable|v*)
      releaseChannel=$OPTARG
      deployMethod=tar
      ;;
    *)
      usage "Invalid release channel: $OPTARG"
      ;;
    esac
    ;;
  n)
    numValidatorsRequested=$OPTARG
    ;;
  r)
    skipSetup=true
    ;;
  o)
    case $OPTARG in
    rejectExtraNodes|noInstallCheck)
      sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
      ;;
    *)
      usage "Unknown option: $OPTARG"
      ;;
    esac
    ;;
  c)
    getClientTypeAndNum
    ;;
  F)
    failOnValidatorBootupFailure=false
    ;;
  i)
    nodeAddress=$OPTARG
    ;;
  d)
    debugBuild=true
    ;;
  *)
    # Reached for options accepted by the optstring but without a case of
    # their own (currently -f). usage() adds the "Error: " prefix itself.
    usage "unhandled option: $opt"
    ;;
  esac
done
# Load the testnet configuration. loadConfigFile is not defined in this file
# (presumably it comes from the sourced common.sh -- TODO confirm). The code
# below reads validatorIpList, clientIpList, netDir, sshOptions and similar
# names that are never assigned here, so they are expected to be set by it.
loadConfigFile
# Path of the per-run "log" symlink; empty until initLogDir has run.
netLogDir=

# Create a fresh timestamped log directory under $netDir and point the
# $netDir/log symlink at it. Only the first call does any work; later calls
# return immediately.
#
# Globals: netDir, netConfigDir (read), netLogDir (written)
initLogDir() {
  if [[ -n $netLogDir ]]; then
    return 0
  fi

  netLogDir="$netDir"/log
  declare datedLogDir
  datedLogDir="$netDir"/log-$(date +"%Y-%m-%d_%H_%M_%S")

  if [[ -L $netLogDir ]]; then
    # Stale symlink from an earlier run
    rm "$netLogDir"
  elif [[ -d $netLogDir ]]; then
    # A real directory occupies the symlink's name; move it aside
    echo "Warning: moving $netLogDir to make way for symlink."
    mv "$netLogDir" "$netDir"/log.old
  fi

  mkdir -p "$netConfigDir" "$datedLogDir"
  ln -sf "$datedLogDir" "$netLogDir"
  echo "Log directory: $datedLogDir"
}
# Honour -n: keep only the first $numValidatorsRequested validators.
if [[ -n $numValidatorsRequested ]]; then
  truncatedNodeList=("${validatorIpList[@]:0:$numValidatorsRequested}")
  unset validatorIpList
  validatorIpList=("${truncatedNodeList[@]}")
fi

# Work out how many client machines exist versus how many were asked for with
# -c. When none were requested, every client machine runs bench-tps.
numClients=${#clientIpList[@]}
numClientsRequested=$((numBenchTpsClients + numBenchExchangeClients + numIdleClients))
if ((numClientsRequested == 0)); then
  numBenchTpsClients=$numClients
  numClientsRequested=$numClients
elif ((numClientsRequested > numClients)); then
  echo "Error: More clients requested ($numClientsRequested) then available ($numClients)"
  exit 1
fi
# Forward all arguments to "buildkite-agent annotate", but only when the
# BUILDKITE environment variable is non-empty; otherwise do nothing.
annotate() {
  if [[ -n $BUILDKITE ]]; then
    buildkite-agent annotate "$@"
  fi
}
# Publish the block explorer URL of the first blockstreamer node as an
# annotation. Does nothing when there is no blockstreamer node.
#
# Globals: blockstreamerIpList (read)
annotateBlockexplorerUrl() {
  declare firstBlockstreamer=${blockstreamerIpList[0]}
  [[ -n $firstBlockstreamer ]] || return 0
  annotate --style info --context blockexplorer-url "Block explorer: http://$firstBlockstreamer/"
}
# Build the software into $SOLANA_ROOT/farf with scripts/cargo-install-all.sh
# and print how long the build took.
#
# Globals: SOLANA_ROOT, debugBuild, maybeUseMove (read); SECONDS (reset)
build() {
# Ubuntu release numbers on which the build runs directly on the host.
supported=("18.04")
declare MAYBE_DOCKER=
# On anything else (not Linux, or an lsb_release value not found in the list
# above) the build command is prefixed with ci/docker-run.sh.
# rust_stable_docker_image is presumably set by ci/rust-version.sh -- TODO confirm.
if [[ $(uname) != Linux || ! " ${supported[*]} " =~ $(lsb_release -sr) ]]; then
# shellcheck source=ci/rust-version.sh
source "$SOLANA_ROOT"/ci/rust-version.sh
MAYBE_DOCKER="ci/docker-run.sh $rust_stable_docker_image"
fi
# SECONDS is bash's elapsed-time counter; reset it so it measures the build.
SECONDS=0
# Subshell: keeps the cd and the xtrace setting from leaking into the caller.
(
cd "$SOLANA_ROOT"
echo "--- Build started at $(date)"
set -x
# Remove the output of any previous build.
rm -rf farf
# An empty buildVariant is passed through as an empty argument.
buildVariant=
if $debugBuild; then
buildVariant=debug
fi
# $MAYBE_DOCKER is intentionally unquoted: it is either empty or a command
# prefix that must be split into words.
$MAYBE_DOCKER bash -c "
set -ex
scripts/cargo-install-all.sh farf \"$buildVariant\" \"$maybeUseMove\"
"
)
echo "Build took $SECONDS seconds"
}
# Prepare a remote node for a (re)start: reset its ~/solana directory, make
# sure ~/.cargo/bin exists, optionally install an extra ssh key, then copy the
# helper scripts across.
#
# Arguments: $1 - address of the node
# Globals:   SOLANA_ROOT, skipSetup, sshOptions, externalNodeSshKey (read)
startCommon() {
declare ipAddress=$1
# Fails (and, under set -e, aborts) when the local source tree is missing.
test -d "$SOLANA_ROOT"
if $skipSetup; then
# Reusing the previous configuration: move ~/solana/config out of the way,
# wipe ~/solana, then move the config back into the recreated directory.
ssh "${sshOptions[@]}" "$ipAddress" "
set -x;
mkdir -p ~/solana/config;
rm -rf ~/config;
mv ~/solana/config ~;
rm -rf ~/solana;
mkdir -p ~/solana ~/.cargo/bin;
mv ~/config ~/solana/
"
else
# Fresh setup: remove ~/solana entirely.
ssh "${sshOptions[@]}" "$ipAddress" "
set -x;
rm -rf ~/solana;
mkdir -p ~/.cargo/bin
"
fi
# Only when an external node ssh key is configured: install it for the
# "solana" user on the node.
[[ -z "$externalNodeSshKey" ]] || ssh-copy-id -f -i "$externalNodeSshKey" "${sshOptions[@]}" "solana@$ipAddress"
syncScripts "$ipAddress"
}
# Copy the helper scripts and script directories from the local source tree
# to ~/solana/ on a remote node. Local log directories (net/log*) are excluded
# and rsync's own output is discarded.
#
# Arguments: $1 - address of the node to sync to
# Globals:   SOLANA_ROOT, sshOptions (read)
syncScripts() {
  # Bind the argument before logging it. The message used to be printed
  # first, so it showed whatever ipAddress happened to be visible from the
  # caller (or nothing at all) rather than this function's argument.
  declare ipAddress=$1
  echo "rsyncing scripts... to $ipAddress"
  rsync -vPrc -e "ssh ${sshOptions[*]}" \
    --exclude 'net/log*' \
    "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
    "$ipAddress":~/solana/ > /dev/null
}
# Deploy to and start the bootstrap leader, synchronously. All output is
# appended to the log file; on failure the log is printed and the script exits
# with status 1.
#
# Arguments: $1 - address of the bootstrap leader
#            $2 - node index passed on to remote-node.sh
#            $3 - path of the local log file
startBootstrapLeader() {
declare ipAddress=$1
declare nodeIndex="$2"
declare logFile="$3"
echo "--- Starting bootstrap leader: $ipAddress"
echo "start log: $logFile"
# Deploy local binaries to bootstrap validator. Other validators and clients later fetch the
# binaries from it
(
set -x
startCommon "$ipAddress" || exit 1
# Upload the external accounts file only when one was given on the command line.
[[ -z "$externalPrimordialAccountsFile" ]] || rsync -vPrc -e "ssh ${sshOptions[*]}" "$externalPrimordialAccountsFile" \
"$ipAddress:$remoteExternalPrimordialAccountsFile"
# Push binaries according to the deploy method: an unpacked release tarball
# (tar), the local build output (local), or nothing at all (skip).
case $deployMethod in
tar)
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/bin/* "$ipAddress:~/.cargo/bin/"
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/version.yml "$ipAddress:~/"
;;
local)
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
# A local build has no version.yml, so leave an empty one on the node.
ssh "${sshOptions[@]}" -n "$ipAddress" "rm -f ~/version.yml; touch ~/version.yml"
;;
skip)
;;
*)
usage "Internal error: invalid deployMethod: $deployMethod"
;;
esac
# Run remote-node.sh on the node. Every variable below is expanded locally
# before the command line is sent, and the arguments are positional, so their
# order has to match what remote-node.sh expects. The fourth argument is the
# combined count of validator, blockstreamer and archiver nodes.
ssh "${sshOptions[@]}" -n "$ipAddress" \
"./solana/net/remote/remote-node.sh \
$deployMethod \
bootstrap-leader \
$entrypointIp \
$((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
\"$RUST_LOG\" \
$skipSetup \
$failOnValidatorBootupFailure \
\"$remoteExternalPrimordialAccountsFile\" \
\"$maybeDisableAirdrops\" \
\"$internalNodesStakeLamports\" \
\"$internalNodesLamports\" \
$nodeIndex \
$numBenchTpsClients \"$benchTpsExtraArgs\" \
$numBenchExchangeClients \"$benchExchangeExtraArgs\" \
\"$genesisOptions\" \
\"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
\"$gpuMode\" \
\"$GEOLOCATION_API_KEY\" \
"
) >> "$logFile" 2>&1 || {
cat "$logFile"
echo "^^^ +++"
exit 1
}
}
# Start a non-bootstrap node in the background. Its pid is appended to the
# global pids array for the caller to wait on, and a validator-<pid>.log
# symlink is created next to the node's log so the log can be found by pid.
#
# Arguments: $1 - address of the node
#            $2 - node type (required); passed on to remote-node.sh
#            $3 - node index (required)
startNode() {
declare ipAddress=$1
declare nodeType=$2
declare nodeIndex="$3"
initLogDir
declare logFile="$netLogDir/validator-$ipAddress.log"
if [[ -z $nodeType ]]; then
echo nodeType not specified
exit 1
fi
if [[ -z $nodeIndex ]]; then
echo nodeIndex not specified
exit 1
fi
echo "--- Starting $nodeType: $ipAddress"
echo "start log: $logFile"
# Everything in this subshell runs in the background with its output appended
# to the node's log file.
(
set -x
startCommon "$ipAddress"
if [[ $nodeType = blockstreamer ]] && [[ -n $letsEncryptDomainName ]]; then
#
# Create/renew TLS certificate
#
# Upload the locally cached certificate archive when there is one, run the
# restore script on the node, then download the node's archive and store it
# as the new local cache.
declare localArchive=~/letsencrypt-"$letsEncryptDomainName".tgz
if [[ -r "$localArchive" ]]; then
timeout 30s scp "${sshOptions[@]}" "$localArchive" "$ipAddress:letsencrypt.tgz"
fi
ssh "${sshOptions[@]}" -n "$ipAddress" \
"sudo -H /certbot-restore.sh $letsEncryptDomainName maintainers@solana.com"
rm -f letsencrypt.tgz
timeout 30s scp "${sshOptions[@]}" "$ipAddress:/letsencrypt.tgz" letsencrypt.tgz
test -s letsencrypt.tgz # Ensure non-empty before overwriting $localArchive
cp letsencrypt.tgz "$localArchive"
fi
# Same positional argument list as the bootstrap leader invocation, with the
# node type taken from $nodeType.
ssh "${sshOptions[@]}" -n "$ipAddress" \
"./solana/net/remote/remote-node.sh \
$deployMethod \
$nodeType \
$entrypointIp \
$((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
\"$RUST_LOG\" \
$skipSetup \
$failOnValidatorBootupFailure \
\"$remoteExternalPrimordialAccountsFile\" \
\"$maybeDisableAirdrops\" \
\"$internalNodesStakeLamports\" \
\"$internalNodesLamports\" \
$nodeIndex \
$numBenchTpsClients \"$benchTpsExtraArgs\" \
$numBenchExchangeClients \"$benchExchangeExtraArgs\" \
\"$genesisOptions\" \
\"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
\"$gpuMode\" \
\"$GEOLOCATION_API_KEY\" \
"
) >> "$logFile" 2>&1 &
declare pid=$!
# Relative symlink, so it resolves inside $netLogDir.
ln -sf "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
pids+=("$pid")
}
# Prepare a client machine and launch the requested client program on it.
# Output is appended to a per-client log; on failure the log is printed and
# the script exits with status 1.
#
# Arguments: $1 - address of the client machine
#            $2 - client program to run (also used in the log file name)
#            $3 - client index passed on to remote-client.sh (may be omitted)
startClient() {
declare ipAddress=$1
declare clientToRun="$2"
declare clientIndex="$3"
initLogDir
declare logFile="$netLogDir/client-$clientToRun-$ipAddress.log"
echo "--- Starting client: $ipAddress - $clientToRun"
echo "start log: $logFile"
(
set -x
startCommon "$ipAddress"
# ssh -f puts ssh into the background just before the remote command runs.
# Both bench-tps and bench-exchange extra arguments are always passed.
ssh "${sshOptions[@]}" -f "$ipAddress" \
"./solana/net/remote/remote-client.sh $deployMethod $entrypointIp \
$clientToRun \"$RUST_LOG\" \"$benchTpsExtraArgs\" \"$benchExchangeExtraArgs\" $clientIndex"
) >> "$logFile" 2>&1 || {
cat "$logFile"
echo "^^^ +++"
exit 1
}
}
# Run remote-sanity.sh on the bootstrap leader and, unless told to skip it, on
# the first blockstreamer node as well. Exits with status 1 as soon as either
# check fails.
#
# Arguments: $1 - any non-empty value skips the blockstreamer check
# Globals:   validatorIpList, blockstreamerIpList, sshOptions,
#            sanityExtraArgs, RUST_LOG, metricsWriteDatapoint (read)
sanity() {
  declare skipBlockstreamerSanity=$1

  $metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"

  declare bootstrapLeader=${validatorIpList[0]}
  declare blockstreamer=${blockstreamerIpList[0]}

  annotateBlockexplorerUrl

  echo "--- Sanity: $bootstrapLeader"
  (
    set -x
    # shellcheck disable=SC2029 # remote-sanity.sh args are expanded on client side intentionally
    ssh "${sshOptions[@]}" "$bootstrapLeader" \
      "./solana/net/remote/remote-sanity.sh $bootstrapLeader $sanityExtraArgs \"$RUST_LOG\""
  ) || exit 1

  if [[ -z $skipBlockstreamerSanity && -n $blockstreamer ]]; then
    # A blockstreamer node exists: run the check against it too
    echo "--- Sanity: $blockstreamer"
    (
      set -x
      # shellcheck disable=SC2029 # remote-sanity.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$blockstreamer" \
        "./solana/net/remote/remote-sanity.sh $blockstreamer $sanityExtraArgs \"$RUST_LOG\""
    ) || exit 1
  fi

  $metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
}
# Deploy a solana-install update for every platform named with --platform.
# For each platform: run the update-manifest keypair script, copy
# update_manifest_keypair.json to the first validator, then run
# remote-deploy-update.sh there.
#
# Returns early when no platform was requested. Exits with status 1 when no
# release channel was given or when a platform's deployment fails.
deployUpdate() {
  [[ -n $updatePlatforms ]] || {
    echo "No update platforms"
    return
  }
  [[ -n $releaseChannel ]] || {
    echo "Release channel not specified (use -t option)"
    exit 1
  }

  declare bootstrapLeader=${validatorIpList[0]}

  # $updatePlatforms is a space-separated list, so it is split on purpose
  for updatePlatform in $updatePlatforms; do
    echo "--- Deploying solana-install update: $updatePlatform"
    (
      set -x
      scripts/solana-install-update-manifest-keypair.sh "$updatePlatform"
      timeout 30s scp "${sshOptions[@]}" \
        update_manifest_keypair.json "$bootstrapLeader:solana/update_manifest_keypair.json"
      # shellcheck disable=SC2029 # remote-deploy-update.sh args are expanded on client side intentionally
      ssh "${sshOptions[@]}" "$bootstrapLeader" \
        "./solana/net/remote/remote-deploy-update.sh $releaseChannel $updatePlatform"
    ) || exit 1
  done
}
# Look up $nodeAddress among the validator, blockstreamer and archiver lists.
#
# Globals read:    nodeAddress, validatorIpList, blockstreamerIpList,
#                  archiverIpList
# Globals written: nodeType  - validator, blockstreamer or archiver
#                  nodeIndex - position of the node counted across all three
#                              lists in that order
#                  ipAddress - loop variable; equals $nodeAddress on success
# Exits with status 1 when nodeAddress is empty or not in any list.
getNodeType() {
  echo "getNodeType: $nodeAddress"
  if [[ -z $nodeAddress ]]; then
    echo "Error: nodeAddress not set"
    exit 1
  fi

  nodeIndex=0 # <-- global
  nodeType=validator # <-- global
  # "b" and "r" are sentinels marking where the blockstreamer and archiver
  # addresses begin within the combined list.
  for ipAddress in "${validatorIpList[@]}" b "${blockstreamerIpList[@]}" r "${archiverIpList[@]}"; do
    case $ipAddress in
    b)
      nodeType=blockstreamer
      continue
      ;;
    r)
      nodeType=archiver
      continue
      ;;
    esac

    if [[ $ipAddress = "$nodeAddress" ]]; then
      echo "getNodeType: $nodeType ($nodeIndex)"
      return
    fi
    nodeIndex=$((nodeIndex + 1))
  done

  echo "Error: Unknown node: $nodeAddress"
  exit 1
}
# Obtain the software to deploy, according to $deployMethod:
#   tar   - download the release tarball for $releaseChannel (when set), then
#           unpack $tarballFilename into $SOLANA_ROOT/solana-release
#   local - build from source unless --no-build was given
#   skip  - nothing to prepare
# With --deploy-if-newer, additionally compare the first validator's
# ~/version.yml with the unpacked release and exit 0 when they are identical.
#
# Globals: deployMethod, deployIfNewer, releaseChannel, doBuild, SOLANA_ROOT,
#          sshOptions, validatorIpList (read); tarballFilename (read/written)
prepare_deploy() {
  # Reject an unsupported option combination up front, before spending time
  # on a build or download whose result would only be thrown away.
  if [[ -n $deployIfNewer && $deployMethod != tar ]]; then
    echo "Error: --deploy-if-newer only supported for tar deployments"
    exit 1
  fi

  case $deployMethod in
  tar)
    if [[ -n $releaseChannel ]]; then
      rm -f "$SOLANA_ROOT"/solana-release.tar.bz2
      declare updateDownloadUrl=http://release.solana.com/"$releaseChannel"/solana-release-x86_64-unknown-linux-gnu.tar.bz2
      (
        set -x
        # --fail: treat an HTTP error as a failure instead of saving the
        # server's error page as if it were the tarball.
        curl --fail --retry 5 --retry-delay 2 --retry-connrefused \
          -o "$SOLANA_ROOT"/solana-release.tar.bz2 "$updateDownloadUrl"
      )
      tarballFilename="$SOLANA_ROOT"/solana-release.tar.bz2
    fi
    (
      set -x
      rm -rf "$SOLANA_ROOT"/solana-release
      (cd "$SOLANA_ROOT"; tar jxv) < "$tarballFilename"
      cat "$SOLANA_ROOT"/solana-release/version.yml
    )
    ;;
  local)
    if $doBuild; then
      build
    else
      echo "Build skipped due to --no-build"
    fi
    ;;
  skip)
    ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac

  if [[ -n $deployIfNewer ]]; then
    echo "Fetching current software version"
    (
      set -x
      rsync -vPrc -e "ssh ${sshOptions[*]}" "${validatorIpList[0]}":~/version.yml current-version.yml
    )
    cat current-version.yml
    if diff -q current-version.yml "$SOLANA_ROOT"/solana-release/version.yml; then
      echo "Cluster software version is current. No update required"
      exit 0
    fi
    echo "Cluster software version is old. Update required"
  fi
}
# Bring up the whole network: the bootstrap leader first (synchronously), then
# every other validator, blockstreamer and archiver node in the background,
# then a sanity check, then the clients. Finishes with a timing summary.
#
# Globals read:    validatorIpList, blockstreamerIpList, archiverIpList,
#                  clientIpList, numClients, numClientsRequested,
#                  numBenchTpsClients, numBenchExchangeClients,
#                  clientDelayStart, deployMethod, SOLANA_ROOT,
#                  failOnValidatorBootupFailure, metricsWriteDatapoint
# Globals written: nodeAddress, nodeType, nodeIndex, pids, SECONDS,
#                  additionalNodeDeployTime, clientDeployTime
# Exits with status 1 when a node fails to start and
# failOnValidatorBootupFailure is true.
deploy() {
  initLogDir

  echo "Deployment started at $(date)"
  $metricsWriteDatapoint "testnet-deploy net-start-begin=1"

  # True only for the first node in the list, which becomes the bootstrap leader
  declare bootstrapLeader=true
  for nodeAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}"; do
    nodeType=
    nodeIndex=
    getNodeType

    # The node is addressed through $nodeAddress, the variable this loop owns,
    # rather than through the ipAddress global that getNodeType's own loop
    # happens to leave behind with the same value.
    if $bootstrapLeader; then
      SECONDS=0
      declare bootstrapNodeDeployTime=
      startBootstrapLeader "$nodeAddress" "$nodeIndex" "$netLogDir/bootstrap-leader-$nodeAddress.log"
      bootstrapNodeDeployTime=$SECONDS
      $metricsWriteDatapoint "testnet-deploy net-bootnode-leader-started=1"

      bootstrapLeader=false
      SECONDS=0
      pids=()
    else
      startNode "$nodeAddress" "$nodeType" "$nodeIndex"

      # Stagger additional node start time. If too many nodes start simultaneously
      # the bootstrap node gets more rsync requests from the additional nodes than
      # it can handle.
      sleep 2
    fi
  done

  # Collect the background node starts; validator-<pid>.log is the symlink
  # that startNode created for each of them.
  for pid in "${pids[@]}"; do
    declare ok=true
    wait "$pid" || ok=false
    if ! $ok; then
      echo "+++ validator failed to start"
      cat "$netLogDir/validator-$pid.log"
      if $failOnValidatorBootupFailure; then
        exit 1
      else
        echo "Failure is non-fatal"
      fi
    fi
  done

  $metricsWriteDatapoint "testnet-deploy net-validators-started=1"
  additionalNodeDeployTime=$SECONDS

  annotateBlockexplorerUrl

  sanity skipBlockstreamerSanity # skip sanity on blockstreamer node, it may not
                                 # have caught up to the bootstrap leader yet

  echo "--- Sleeping $clientDelayStart seconds after validators are started before starting clients"
  sleep "$clientDelayStart"

  # Client machines are assigned in order: bench-tps first, then
  # bench-exchange, and whatever remains is started idle.
  SECONDS=0
  for ((i = 0; i < numClients && i < numClientsRequested; i++)); do
    if [[ $i -lt "$numBenchTpsClients" ]]; then
      startClient "${clientIpList[$i]}" "solana-bench-tps" "$i"
    elif [[ $i -lt $((numBenchTpsClients + numBenchExchangeClients)) ]]; then
      startClient "${clientIpList[$i]}" "solana-bench-exchange" $((i - numBenchTpsClients))
    else
      startClient "${clientIpList[$i]}" "idle"
    fi
  done
  clientDeployTime=$SECONDS

  $metricsWriteDatapoint "testnet-deploy net-start-complete=1"

  # Work out a version string to report; only its first 9 characters are sent
  declare networkVersion=unknown
  case $deployMethod in
  tar)
    networkVersion="$(
      (
        set -o pipefail
        grep "^commit: " "$SOLANA_ROOT"/solana-release/version.yml | head -n1 | cut -d\  -f2
      ) || echo "tar-unknown"
    )"
    ;;
  local)
    networkVersion="$(git rev-parse HEAD || echo local-unknown)"
    ;;
  skip)
    ;;
  *)
    usage "Internal error: invalid deployMethod: $deployMethod"
    ;;
  esac
  $metricsWriteDatapoint "testnet-deploy version=\"${networkVersion:0:9}\""

  echo
  echo "+++ Deployment Successful"
  echo "Bootstrap leader deployment took $bootstrapNodeDeployTime seconds"
  echo "Additional validator deployment (${#validatorIpList[@]} validators, ${#blockstreamerIpList[@]} blockstreamer nodes, ${#archiverIpList[@]} archivers) took $additionalNodeDeployTime seconds"
  echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
  echo "Network start logs in $netLogDir"
}
# Stop whatever is running on one node by syncing the scripts to it and then
# running cleanup.sh there in the background.
#
# Arguments: $1 - address of the node
#            $2 - "true" to wait for the stop to finish; "false" to append the
#                 background pid to the global pids array instead
stopNode() {
  declare ipAddress=$1
  declare block=$2

  initLogDir
  declare logName="stop-validator-$ipAddress.log"
  declare logFile="$netLogDir/$logName"

  echo "--- Stopping node: $ipAddress"
  echo "stop log: $logFile"

  syncScripts "$ipAddress"
  (
    # cleanup.sh uses pkill, so the command must not be passed to ssh
    # directly: the process doing the killing would itself match the pkill
    # pattern and be killed.
    set -x
    # shellcheck disable=SC2029 # It's desired that PS4 be expanded on the client side
    ssh "${sshOptions[@]}" "$ipAddress" "PS4=\"$PS4\" ./solana/net/remote/cleanup.sh"
  ) >> "$logFile" 2>&1 &
  declare pid=$!

  # Relative symlink, so the log can also be located by pid
  ln -sf "$logName" "$netLogDir/stop-validator-$pid.log"

  if $block; then
    wait "$pid"
  else
    pids+=("$pid")
  fi
}
# Stop every validator, blockstreamer, archiver and client node. The stops run
# in the background; this function then waits for all of them and reports the
# total time. A node that fails to stop is ignored.
#
# Globals written: pids, SECONDS, ipAddress, pid
stop() {
  SECONDS=0
  $metricsWriteDatapoint "testnet-deploy net-stop-begin=1"

  declare loopCount=0
  pids=()
  for ipAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}" "${clientIpList[@]}"; do
    stopNode "$ipAddress" false

    # Pause before every fourth node so that not too many ssh sessions are
    # open at the same time
    if ((loopCount++ % 4 == 0)); then
      sleep 2
    fi
  done

  echo "--- Waiting for nodes to finish stopping"
  for pid in "${pids[@]}"; do
    echo -n "$pid "
    wait "$pid" || true
  done
  echo

  $metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
  echo "Stopping nodes took $SECONDS seconds"
}
# Ping every validator once and exit with status 1, after printing a warning,
# at the first one that does not answer.
#
# The validatorIpList nodes may be preemptible instances that can disappear at
# any time. Try to detect when a validator has been preempted to help the user
# out.
#
# Of course this isn't airtight as an instance could always disappear
# immediately after it's successfully pinged.
checkPremptibleInstances() {
  local ipAddress
  for ipAddress in "${validatorIpList[@]}"; do
    (
      set -x
      # Without pipefail the pipeline reports only tr's status, which is
      # always success, so a failed or timed-out ping would go unnoticed.
      set -o pipefail
      timeout 5s ping -c 1 "$ipAddress" | tr - _
    ) || {
      cat <<EOF
Warning: $ipAddress may have been preempted.
Run |./gce.sh config| to restart it
EOF
      exit 1
    }
  done
}
# Make sure every validator answers a ping before running any command.
checkPremptibleInstances
# Dispatch on the command given as the first command-line argument.
case $command in
restart)
# The new software is prepared before the running network is stopped.
prepare_deploy
stop
deploy
;;
start)
prepare_deploy
deploy
;;
sanity)
sanity
;;
stop)
stop
;;
update)
deployUpdate
;;
stopnode)
if [[ -z $nodeAddress ]]; then
usage "node address (-i) not specified"
exit 1
fi
# "true": wait for the node to finish stopping.
stopNode "$nodeAddress" true
;;
startnode)
if [[ -z $nodeAddress ]]; then
usage "node address (-i) not specified"
exit 1
fi
# getNodeType fills in the nodeType and nodeIndex globals for $nodeAddress.
nodeType=
nodeIndex=
getNodeType
startNode "$nodeAddress" $nodeType $nodeIndex
;;
logs)
initLogDir
# Copy solana/<log>.log from a node into the local log directory as
# remote-<log>-<address>.log. A failed copy is reported but is not fatal.
# Arguments: $1 - address of the node, $2 - log name without the .log suffix
fetchRemoteLog() {
declare ipAddress=$1
declare log=$2
echo "--- fetching $log from $ipAddress"
(
set -x
timeout 30s scp "${sshOptions[@]}" \
"$ipAddress":solana/"$log".log "$netLogDir"/remote-"$log"-"$ipAddress".log
) || echo "failed to fetch log"
}
# The faucet log is fetched from the first validator only.
fetchRemoteLog "${validatorIpList[0]}" faucet
for ipAddress in "${validatorIpList[@]}"; do
fetchRemoteLog "$ipAddress" validator
done
for ipAddress in "${clientIpList[@]}"; do
fetchRemoteLog "$ipAddress" client
done
for ipAddress in "${blockstreamerIpList[@]}"; do
fetchRemoteLog "$ipAddress" validator
done
for ipAddress in "${archiverIpList[@]}"; do
fetchRemoteLog "$ipAddress" validator
done
;;
netem)
if [[ -n $netemConfigFile ]]; then
# Config-file mode: for "add", first copy the file to every validator; then
# run net-shaper.sh on each validator with the command, the remote path of the
# file, the number of validators and this validator's index.
if [[ $netemCommand = "add" ]]; then
for ipAddress in "${validatorIpList[@]}"; do
"$here"/scp.sh "$netemConfigFile" solana@"$ipAddress":~/solana
done
fi
for i in "${!validatorIpList[@]}"; do
"$here"/ssh.sh solana@"${validatorIpList[$i]}" 'solana/scripts/net-shaper.sh' \
"$netemCommand" ~solana/solana/"$netemConfigFile" "${#validatorIpList[@]}" "$i"
done
else
# Percentage mode: num_nodes is netemPartition percent of the validators,
# rounded up and capped at the number of validators.
num_nodes=$((${#validatorIpList[@]}*netemPartition/100))
if [[ $((${#validatorIpList[@]}*netemPartition%100)) -gt 0 ]]; then
num_nodes=$((num_nodes+1))
fi
if [[ "$num_nodes" -gt "${#validatorIpList[@]}" ]]; then
num_nodes=${#validatorIpList[@]}
fi
# Stop netem on all nodes
for ipAddress in "${validatorIpList[@]}"; do
"$here"/ssh.sh solana@"$ipAddress" 'solana/scripts/netem.sh delete < solana/netem.cfg || true'
done
# Start netem on required nodes
# The configuration is also saved to solana/netem.cfg on the node, which is
# the file the delete step above reads.
for ((i=0; i<num_nodes; i++ )); do :
"$here"/ssh.sh solana@"${validatorIpList[$i]}" "echo $netemConfig > solana/netem.cfg; solana/scripts/netem.sh add \"$netemConfig\""
done
fi
;;
*)
# Any command name not handled above ends up here.
echo "Internal error: Unknown command: $command"
usage
exit 1
esac