This reverts commit a21792056130457d3d9ceb75168973677ad6a050. This commit is causing trouble when the TdS cluster is reset and validators running an older genesis config are still present. Occasionally an RPC URL from an older validator will be selected, causing a new node to fail to boot.
1046 lines
31 KiB
Bash
Executable File
1046 lines
31 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
set -e
|
|
|
|
here=$(dirname "$0")
|
|
SOLANA_ROOT="$(cd "$here"/..; pwd)"
|
|
|
|
# shellcheck source=net/common.sh
|
|
source "$here"/common.sh
|
|
|
|
usage() {
|
|
exitcode=0
|
|
if [[ -n "$1" ]]; then
|
|
exitcode=1
|
|
echo "Error: $*"
|
|
fi
|
|
cat <<EOF
|
|
usage: $0 [start|stop|restart|sanity] [command-specific options]
|
|
|
|
Operate a configured testnet
|
|
|
|
start - Start the network
|
|
sanity - Sanity check the network
|
|
stop - Stop the network
|
|
restart - Shortcut for stop then start
|
|
logs - Fetch remote logs from each network node
|
|
startnode- Start an individual node (previously stopped with stopNode)
|
|
stopnode - Stop an individual node
|
|
update - Deploy a new software update to the cluster
|
|
|
|
start-specific options:
|
|
-T [tarFilename] - Deploy the specified release tarball
|
|
-t edge|beta|stable|vX.Y.Z - Deploy the latest tarball release for the
|
|
specified release channel (edge|beta|stable) or release tag
|
|
(vX.Y.Z)
|
|
-r / --skip-setup - Reuse existing node/ledger configuration from a
|
|
previous |start| (ie, don't run ./multinode-demo/setup.sh).
|
|
-d / --debug - Build/deploy the testnet with debug binaries
|
|
-c clientType=numClients=extraArgs - Number of clientTypes to start. This options can be specified
|
|
more than once. Defaults to bench-tps for all clients if not
|
|
specified.
|
|
Valid client types are:
|
|
idle
|
|
bench-tps
|
|
bench-exchange
|
|
User can optionally provide extraArgs that are transparently
|
|
supplied to the client program as command line parameters.
|
|
For example,
|
|
-c bench-tps=2="--tx_count 25000"
|
|
This will start 2 bench-tps clients, and supply "--tx_count 25000"
|
|
to the bench-tps client.
|
|
--client-delay-start
|
|
- Number of seconds to wait after validators have finished starting before starting client programs
|
|
(default: $clientDelayStart)
|
|
-n NUM_VALIDATORS - Number of validators to apply command to.
|
|
--gpu-mode GPU_MODE - Specify GPU mode to launch validators with (default: $gpuMode).
|
|
MODE must be one of
|
|
on - GPU *required*, any vendor *
|
|
off - No GPU, CPU-only
|
|
auto - Use GPU if available, any vendor *
|
|
cuda - GPU *required*, Nvidia CUDA only
|
|
* Currently, Nvidia CUDA is the only supported GPU vendor
|
|
--hashes-per-tick NUM_HASHES|sleep|auto
|
|
- Override the default --hashes-per-tick for the cluster
|
|
--no-airdrop
|
|
- If set, disables airdrops. Nodes must be funded in genesis config when airdrops are disabled.
|
|
--faucet-lamports NUM_LAMPORTS_TO_MINT
|
|
- Override the default 500000000000000000 lamports minted in genesis
|
|
--internal-nodes-stake-lamports NUM_LAMPORTS_PER_NODE
|
|
- Amount to stake internal nodes.
|
|
--internal-nodes-lamports NUM_LAMPORTS_PER_NODE
|
|
- Amount to fund internal nodes in genesis config.
|
|
--external-accounts-file FILE_PATH
|
|
- A YML file with a list of account pubkeys and corresponding lamport balances
|
|
in genesis config for external nodes
|
|
--no-snapshot-fetch
|
|
- If set, disables booting validators from a snapshot
|
|
--skip-poh-verify
|
|
- If set, validators will skip verifying
|
|
the ledger they already have saved to disk at
|
|
boot (results in a much faster boot)
|
|
--no-deploy
|
|
- Don't deploy new software, use the
|
|
existing deployment
|
|
--no-build
|
|
- Don't build new software, deploy the
|
|
existing binaries
|
|
|
|
--deploy-if-newer - Only deploy if newer software is
|
|
available (requires -t or -T)
|
|
|
|
--use-move - Build the move-loader-program and add it to the cluster
|
|
|
|
--operating-mode development|softlaunch
|
|
- Specify whether or not to launch the cluster in "development" mode with all features enabled at epoch 0,
|
|
or "softlaunch" mode with some features disabled at epoch 0 (default: development)
|
|
sanity/start-specific options:
|
|
-F - Discard validator nodes that didn't bootup successfully
|
|
-o noInstallCheck - Skip solana-install sanity
|
|
-o rejectExtraNodes - Require the exact number of nodes
|
|
|
|
stop-specific options:
|
|
none
|
|
|
|
logs-specific options:
|
|
none
|
|
|
|
netem-specific options:
|
|
--config - Netem configuration (as a double quoted string)
|
|
--parition - Percentage of network that should be configured with netem
|
|
--config-file - Configuration file for partition and netem configuration
|
|
--netem-cmd - Optional command argument to netem. Default is "add". Use "cleanup" to remove rules.
|
|
|
|
update-specific options:
|
|
--platform linux|osx|windows - Deploy the tarball using 'solana-install deploy ...' for the
|
|
given platform (multiple platforms may be specified)
|
|
(-t option must be supplied as well)
|
|
|
|
startnode/stopnode-specific options:
|
|
-i [ip address] - IP Address of the node to start or stop
|
|
|
|
Note: if RUST_LOG is set in the environment it will be propogated into the
|
|
network nodes.
|
|
EOF
|
|
exit $exitcode
|
|
}
|
|
|
|
releaseChannel=
|
|
deployMethod=local
|
|
deployIfNewer=
|
|
sanityExtraArgs=
|
|
skipSetup=false
|
|
updatePlatforms=
|
|
nodeAddress=
|
|
numIdleClients=0
|
|
numBenchTpsClients=0
|
|
numBenchExchangeClients=0
|
|
benchTpsExtraArgs=
|
|
benchExchangeExtraArgs=
|
|
failOnValidatorBootupFailure=true
|
|
genesisOptions=
|
|
numValidatorsRequested=
|
|
externalPrimordialAccountsFile=
|
|
remoteExternalPrimordialAccountsFile=
|
|
internalNodesStakeLamports=
|
|
internalNodesLamports=
|
|
maybeNoSnapshot=""
|
|
maybeLimitLedgerSize=""
|
|
maybeSkipLedgerVerify=""
|
|
maybeDisableAirdrops=""
|
|
debugBuild=false
|
|
doBuild=true
|
|
gpuMode=auto
|
|
maybeUseMove=""
|
|
netemPartition=""
|
|
netemConfig=""
|
|
netemConfigFile=""
|
|
netemCommand="add"
|
|
clientDelayStart=0
|
|
|
|
command=$1
|
|
[[ -n $command ]] || usage
|
|
shift
|
|
|
|
shortArgs=()
|
|
while [[ -n $1 ]]; do
|
|
if [[ ${1:0:2} = -- ]]; then
|
|
if [[ $1 = --hashes-per-tick ]]; then
|
|
genesisOptions="$genesisOptions $1 $2"
|
|
shift 2
|
|
elif [[ $1 = --slots-per-epoch ]]; then
|
|
genesisOptions="$genesisOptions $1 $2"
|
|
shift 2
|
|
elif [[ $1 = --target-lamports-per-signature ]]; then
|
|
genesisOptions="$genesisOptions $1 $2"
|
|
shift 2
|
|
elif [[ $1 = --faucet-lamports ]]; then
|
|
genesisOptions="$genesisOptions $1 $2"
|
|
shift 2
|
|
elif [[ $1 = --operating-mode ]]; then
|
|
case "$2" in
|
|
development|softlaunch)
|
|
;;
|
|
*)
|
|
echo "Unexpected operating mode: \"$2\""
|
|
exit 1
|
|
;;
|
|
esac
|
|
genesisOptions="$genesisOptions $1 $2"
|
|
shift 2
|
|
elif [[ $1 = --no-snapshot-fetch ]]; then
|
|
maybeNoSnapshot="$1"
|
|
shift 1
|
|
elif [[ $1 = --deploy-if-newer ]]; then
|
|
deployIfNewer=1
|
|
shift 1
|
|
elif [[ $1 = --no-deploy ]]; then
|
|
deployMethod=skip
|
|
shift 1
|
|
elif [[ $1 = --no-build ]]; then
|
|
doBuild=false
|
|
shift 1
|
|
elif [[ $1 = --limit-ledger-size ]]; then
|
|
maybeLimitLedgerSize="$1"
|
|
shift 1
|
|
elif [[ $1 = --skip-poh-verify ]]; then
|
|
maybeSkipLedgerVerify="$1"
|
|
shift 1
|
|
elif [[ $1 = --skip-setup ]]; then
|
|
skipSetup=true
|
|
shift 1
|
|
elif [[ $1 = --platform ]]; then
|
|
updatePlatforms="$updatePlatforms $2"
|
|
shift 2
|
|
elif [[ $1 = --internal-nodes-stake-lamports ]]; then
|
|
internalNodesStakeLamports="$2"
|
|
shift 2
|
|
elif [[ $1 = --internal-nodes-lamports ]]; then
|
|
internalNodesLamports="$2"
|
|
shift 2
|
|
elif [[ $1 = --external-accounts-file ]]; then
|
|
externalPrimordialAccountsFile="$2"
|
|
remoteExternalPrimordialAccountsFile=/tmp/external-primordial-accounts.yml
|
|
shift 2
|
|
elif [[ $1 = --no-airdrop ]]; then
|
|
maybeDisableAirdrops="$1"
|
|
shift 1
|
|
elif [[ $1 = --debug ]]; then
|
|
debugBuild=true
|
|
shift 1
|
|
elif [[ $1 = --use-move ]]; then
|
|
maybeUseMove=$1
|
|
shift 1
|
|
elif [[ $1 = --partition ]]; then
|
|
netemPartition=$2
|
|
shift 2
|
|
elif [[ $1 = --config ]]; then
|
|
netemConfig=$2
|
|
shift 2
|
|
elif [[ $1 == --config-file ]]; then
|
|
netemConfigFile=$2
|
|
shift 2
|
|
elif [[ $1 == --netem-cmd ]]; then
|
|
netemCommand=$2
|
|
shift 2
|
|
elif [[ $1 = --gpu-mode ]]; then
|
|
gpuMode=$2
|
|
case "$gpuMode" in
|
|
on|off|auto|cuda)
|
|
;;
|
|
*)
|
|
echo "Unexpected GPU mode: \"$gpuMode\""
|
|
exit 1
|
|
;;
|
|
esac
|
|
shift 2
|
|
elif [[ $1 == --client-delay-start ]]; then
|
|
clientDelayStart=$2
|
|
shift 2
|
|
else
|
|
usage "Unknown long option: $1"
|
|
fi
|
|
else
|
|
shortArgs+=("$1")
|
|
shift
|
|
fi
|
|
done
|
|
|
|
while getopts "h?T:t:o:f:rc:Fn:i:d" opt "${shortArgs[@]}"; do
|
|
case $opt in
|
|
h | \?)
|
|
usage
|
|
;;
|
|
T)
|
|
tarballFilename=$OPTARG
|
|
[[ -r $tarballFilename ]] || usage "File not readable: $tarballFilename"
|
|
deployMethod=tar
|
|
;;
|
|
t)
|
|
case $OPTARG in
|
|
edge|beta|stable|v*)
|
|
releaseChannel=$OPTARG
|
|
deployMethod=tar
|
|
;;
|
|
*)
|
|
usage "Invalid release channel: $OPTARG"
|
|
;;
|
|
esac
|
|
;;
|
|
n)
|
|
numValidatorsRequested=$OPTARG
|
|
;;
|
|
r)
|
|
skipSetup=true
|
|
;;
|
|
o)
|
|
case $OPTARG in
|
|
rejectExtraNodes|noInstallCheck)
|
|
sanityExtraArgs="$sanityExtraArgs -o $OPTARG"
|
|
;;
|
|
*)
|
|
usage "Unknown option: $OPTARG"
|
|
;;
|
|
esac
|
|
;;
|
|
c)
|
|
getClientTypeAndNum() {
|
|
if ! [[ $OPTARG == *'='* ]]; then
|
|
echo "Error: Expecting tuple \"clientType=numClientType=extraArgs\" but got \"$OPTARG\""
|
|
exit 1
|
|
fi
|
|
local keyValue
|
|
IFS='=' read -ra keyValue <<< "$OPTARG"
|
|
local clientType=${keyValue[0]}
|
|
local numClients=${keyValue[1]}
|
|
local extraArgs=${keyValue[2]}
|
|
re='^[0-9]+$'
|
|
if ! [[ $numClients =~ $re ]] ; then
|
|
echo "error: numClientType must be a number but got \"$numClients\""
|
|
exit 1
|
|
fi
|
|
case $clientType in
|
|
idle)
|
|
numIdleClients=$numClients
|
|
# $extraArgs ignored for 'idle'
|
|
;;
|
|
bench-tps)
|
|
numBenchTpsClients=$numClients
|
|
benchTpsExtraArgs=$extraArgs
|
|
;;
|
|
bench-exchange)
|
|
numBenchExchangeClients=$numClients
|
|
benchExchangeExtraArgs=$extraArgs
|
|
;;
|
|
*)
|
|
echo "Unknown client type: $clientType"
|
|
exit 1
|
|
;;
|
|
esac
|
|
}
|
|
getClientTypeAndNum
|
|
;;
|
|
F)
|
|
failOnValidatorBootupFailure=false
|
|
;;
|
|
i)
|
|
nodeAddress=$OPTARG
|
|
;;
|
|
d)
|
|
debugBuild=true
|
|
;;
|
|
*)
|
|
usage "Error: unhandled option: $opt"
|
|
;;
|
|
esac
|
|
done
|
|
|
|
loadConfigFile
|
|
|
|
netLogDir=
|
|
initLogDir() { # Initializes the netLogDir global variable. Idempotent
|
|
[[ -z $netLogDir ]] || return 0
|
|
|
|
netLogDir="$netDir"/log
|
|
declare netLogDateDir
|
|
netLogDateDir="$netDir"/log-$(date +"%Y-%m-%d_%H_%M_%S")
|
|
if [[ -d $netLogDir && ! -L $netLogDir ]]; then
|
|
echo "Warning: moving $netLogDir to make way for symlink."
|
|
mv "$netLogDir" "$netDir"/log.old
|
|
elif [[ -L $netLogDir ]]; then
|
|
rm "$netLogDir"
|
|
fi
|
|
mkdir -p "$netConfigDir" "$netLogDateDir"
|
|
ln -sf "$netLogDateDir" "$netLogDir"
|
|
echo "Log directory: $netLogDateDir"
|
|
}
|
|
|
|
if [[ -n $numValidatorsRequested ]]; then
|
|
truncatedNodeList=( "${validatorIpList[@]:0:$numValidatorsRequested}" )
|
|
unset validatorIpList
|
|
validatorIpList=( "${truncatedNodeList[@]}" )
|
|
fi
|
|
|
|
numClients=${#clientIpList[@]}
|
|
numClientsRequested=$((numBenchTpsClients + numBenchExchangeClients + numIdleClients))
|
|
if [[ "$numClientsRequested" -eq 0 ]]; then
|
|
numBenchTpsClients=$numClients
|
|
numClientsRequested=$numClients
|
|
else
|
|
if [[ "$numClientsRequested" -gt "$numClients" ]]; then
|
|
echo "Error: More clients requested ($numClientsRequested) then available ($numClients)"
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
annotate() {
|
|
[[ -z $BUILDKITE ]] || {
|
|
buildkite-agent annotate "$@"
|
|
}
|
|
}
|
|
|
|
annotateBlockexplorerUrl() {
|
|
declare blockstreamer=${blockstreamerIpList[0]}
|
|
|
|
if [[ -n $blockstreamer ]]; then
|
|
annotate --style info --context blockexplorer-url "Block explorer: http://$blockstreamer/"
|
|
fi
|
|
}
|
|
|
|
build() {
|
|
supported=("18.04")
|
|
declare MAYBE_DOCKER=
|
|
if [[ $(uname) != Linux || ! " ${supported[*]} " =~ $(lsb_release -sr) ]]; then
|
|
# shellcheck source=ci/rust-version.sh
|
|
source "$SOLANA_ROOT"/ci/rust-version.sh
|
|
MAYBE_DOCKER="ci/docker-run.sh $rust_stable_docker_image"
|
|
fi
|
|
SECONDS=0
|
|
(
|
|
cd "$SOLANA_ROOT"
|
|
echo "--- Build started at $(date)"
|
|
|
|
set -x
|
|
rm -rf farf
|
|
|
|
buildVariant=
|
|
if $debugBuild; then
|
|
buildVariant=debug
|
|
fi
|
|
|
|
$MAYBE_DOCKER bash -c "
|
|
set -ex
|
|
scripts/cargo-install-all.sh farf \"$buildVariant\" \"$maybeUseMove\"
|
|
"
|
|
)
|
|
echo "Build took $SECONDS seconds"
|
|
}
|
|
|
|
startCommon() {
|
|
declare ipAddress=$1
|
|
test -d "$SOLANA_ROOT"
|
|
if $skipSetup; then
|
|
ssh "${sshOptions[@]}" "$ipAddress" "
|
|
set -x;
|
|
mkdir -p ~/solana/config;
|
|
rm -rf ~/config;
|
|
mv ~/solana/config ~;
|
|
rm -rf ~/solana;
|
|
mkdir -p ~/solana ~/.cargo/bin;
|
|
mv ~/config ~/solana/
|
|
"
|
|
else
|
|
ssh "${sshOptions[@]}" "$ipAddress" "
|
|
set -x;
|
|
rm -rf ~/solana;
|
|
mkdir -p ~/.cargo/bin
|
|
"
|
|
fi
|
|
[[ -z "$externalNodeSshKey" ]] || ssh-copy-id -f -i "$externalNodeSshKey" "${sshOptions[@]}" "solana@$ipAddress"
|
|
syncScripts "$ipAddress"
|
|
}
|
|
|
|
syncScripts() {
|
|
echo "rsyncing scripts... to $ipAddress"
|
|
declare ipAddress=$1
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" \
|
|
--exclude 'net/log*' \
|
|
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
|
|
"$ipAddress":~/solana/ > /dev/null
|
|
}
|
|
|
|
startBootstrapLeader() {
|
|
declare ipAddress=$1
|
|
declare nodeIndex="$2"
|
|
declare logFile="$3"
|
|
echo "--- Starting bootstrap leader: $ipAddress"
|
|
echo "start log: $logFile"
|
|
|
|
# Deploy local binaries to bootstrap validator. Other validators and clients later fetch the
|
|
# binaries from it
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress" || exit 1
|
|
[[ -z "$externalPrimordialAccountsFile" ]] || rsync -vPrc -e "ssh ${sshOptions[*]}" "$externalPrimordialAccountsFile" \
|
|
"$ipAddress:$remoteExternalPrimordialAccountsFile"
|
|
case $deployMethod in
|
|
tar)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/bin/* "$ipAddress:~/.cargo/bin/"
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/solana-release/version.yml "$ipAddress:~/"
|
|
;;
|
|
local)
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" "rm -f ~/version.yml; touch ~/version.yml"
|
|
;;
|
|
skip)
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"./solana/net/remote/remote-node.sh \
|
|
$deployMethod \
|
|
bootstrap-leader \
|
|
$entrypointIp \
|
|
$((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
|
|
\"$RUST_LOG\" \
|
|
$skipSetup \
|
|
$failOnValidatorBootupFailure \
|
|
\"$remoteExternalPrimordialAccountsFile\" \
|
|
\"$maybeDisableAirdrops\" \
|
|
\"$internalNodesStakeLamports\" \
|
|
\"$internalNodesLamports\" \
|
|
$nodeIndex \
|
|
$numBenchTpsClients \"$benchTpsExtraArgs\" \
|
|
$numBenchExchangeClients \"$benchExchangeExtraArgs\" \
|
|
\"$genesisOptions\" \
|
|
\"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
|
|
\"$gpuMode\" \
|
|
\"$GEOLOCATION_API_KEY\" \
|
|
"
|
|
|
|
) >> "$logFile" 2>&1 || {
|
|
cat "$logFile"
|
|
echo "^^^ +++"
|
|
exit 1
|
|
}
|
|
}
|
|
|
|
startNode() {
|
|
declare ipAddress=$1
|
|
declare nodeType=$2
|
|
declare nodeIndex="$3"
|
|
|
|
initLogDir
|
|
declare logFile="$netLogDir/validator-$ipAddress.log"
|
|
|
|
if [[ -z $nodeType ]]; then
|
|
echo nodeType not specified
|
|
exit 1
|
|
fi
|
|
|
|
if [[ -z $nodeIndex ]]; then
|
|
echo nodeIndex not specified
|
|
exit 1
|
|
fi
|
|
|
|
echo "--- Starting $nodeType: $ipAddress"
|
|
echo "start log: $logFile"
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress"
|
|
|
|
if [[ $nodeType = blockstreamer ]] && [[ -n $letsEncryptDomainName ]]; then
|
|
#
|
|
# Create/renew TLS certificate
|
|
#
|
|
declare localArchive=~/letsencrypt-"$letsEncryptDomainName".tgz
|
|
if [[ -r "$localArchive" ]]; then
|
|
timeout 30s scp "${sshOptions[@]}" "$localArchive" "$ipAddress:letsencrypt.tgz"
|
|
fi
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"sudo -H /certbot-restore.sh $letsEncryptDomainName maintainers@solana.com"
|
|
rm -f letsencrypt.tgz
|
|
timeout 30s scp "${sshOptions[@]}" "$ipAddress:/letsencrypt.tgz" letsencrypt.tgz
|
|
test -s letsencrypt.tgz # Ensure non-empty before overwriting $localArchive
|
|
cp letsencrypt.tgz "$localArchive"
|
|
fi
|
|
|
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
|
"./solana/net/remote/remote-node.sh \
|
|
$deployMethod \
|
|
$nodeType \
|
|
$entrypointIp \
|
|
$((${#validatorIpList[@]} + ${#blockstreamerIpList[@]} + ${#archiverIpList[@]})) \
|
|
\"$RUST_LOG\" \
|
|
$skipSetup \
|
|
$failOnValidatorBootupFailure \
|
|
\"$remoteExternalPrimordialAccountsFile\" \
|
|
\"$maybeDisableAirdrops\" \
|
|
\"$internalNodesStakeLamports\" \
|
|
\"$internalNodesLamports\" \
|
|
$nodeIndex \
|
|
$numBenchTpsClients \"$benchTpsExtraArgs\" \
|
|
$numBenchExchangeClients \"$benchExchangeExtraArgs\" \
|
|
\"$genesisOptions\" \
|
|
\"$maybeNoSnapshot $maybeSkipLedgerVerify $maybeLimitLedgerSize\" \
|
|
\"$gpuMode\" \
|
|
\"$GEOLOCATION_API_KEY\" \
|
|
"
|
|
) >> "$logFile" 2>&1 &
|
|
declare pid=$!
|
|
ln -sf "validator-$ipAddress.log" "$netLogDir/validator-$pid.log"
|
|
pids+=("$pid")
|
|
}
|
|
|
|
startClient() {
|
|
declare ipAddress=$1
|
|
declare clientToRun="$2"
|
|
declare clientIndex="$3"
|
|
|
|
initLogDir
|
|
declare logFile="$netLogDir/client-$clientToRun-$ipAddress.log"
|
|
|
|
echo "--- Starting client: $ipAddress - $clientToRun"
|
|
echo "start log: $logFile"
|
|
(
|
|
set -x
|
|
startCommon "$ipAddress"
|
|
ssh "${sshOptions[@]}" -f "$ipAddress" \
|
|
"./solana/net/remote/remote-client.sh $deployMethod $entrypointIp \
|
|
$clientToRun \"$RUST_LOG\" \"$benchTpsExtraArgs\" \"$benchExchangeExtraArgs\" $clientIndex"
|
|
) >> "$logFile" 2>&1 || {
|
|
cat "$logFile"
|
|
echo "^^^ +++"
|
|
exit 1
|
|
}
|
|
}
|
|
|
|
sanity() {
|
|
declare skipBlockstreamerSanity=$1
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-sanity-begin=1"
|
|
|
|
declare ok=true
|
|
declare bootstrapLeader=${validatorIpList[0]}
|
|
declare blockstreamer=${blockstreamerIpList[0]}
|
|
|
|
annotateBlockexplorerUrl
|
|
|
|
echo "--- Sanity: $bootstrapLeader"
|
|
(
|
|
set -x
|
|
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
|
|
ssh "${sshOptions[@]}" "$bootstrapLeader" \
|
|
"./solana/net/remote/remote-sanity.sh $bootstrapLeader $sanityExtraArgs \"$RUST_LOG\""
|
|
) || ok=false
|
|
$ok || exit 1
|
|
|
|
if [[ -z $skipBlockstreamerSanity && -n $blockstreamer ]]; then
|
|
# If there's a blockstreamer node run a reduced sanity check on it as well
|
|
echo "--- Sanity: $blockstreamer"
|
|
(
|
|
set -x
|
|
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
|
|
ssh "${sshOptions[@]}" "$blockstreamer" \
|
|
"./solana/net/remote/remote-sanity.sh $blockstreamer $sanityExtraArgs \"$RUST_LOG\""
|
|
) || ok=false
|
|
$ok || exit 1
|
|
fi
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-sanity-complete=1"
|
|
}
|
|
|
|
deployUpdate() {
|
|
if [[ -z $updatePlatforms ]]; then
|
|
echo "No update platforms"
|
|
return
|
|
fi
|
|
if [[ -z $releaseChannel ]]; then
|
|
echo "Release channel not specified (use -t option)"
|
|
exit 1
|
|
fi
|
|
|
|
declare ok=true
|
|
declare bootstrapLeader=${validatorIpList[0]}
|
|
|
|
for updatePlatform in $updatePlatforms; do
|
|
echo "--- Deploying solana-install update: $updatePlatform"
|
|
(
|
|
set -x
|
|
|
|
scripts/solana-install-update-manifest-keypair.sh "$updatePlatform"
|
|
|
|
timeout 30s scp "${sshOptions[@]}" \
|
|
update_manifest_keypair.json "$bootstrapLeader:solana/update_manifest_keypair.json"
|
|
|
|
# shellcheck disable=SC2029 # remote-deploy-update.sh args are expanded on client side intentionally
|
|
ssh "${sshOptions[@]}" "$bootstrapLeader" \
|
|
"./solana/net/remote/remote-deploy-update.sh $releaseChannel $updatePlatform"
|
|
) || ok=false
|
|
$ok || exit 1
|
|
done
|
|
}
|
|
|
|
getNodeType() {
|
|
echo "getNodeType: $nodeAddress"
|
|
[[ -n $nodeAddress ]] || {
|
|
echo "Error: nodeAddress not set"
|
|
exit 1
|
|
}
|
|
nodeIndex=0 # <-- global
|
|
nodeType=validator # <-- global
|
|
|
|
for ipAddress in "${validatorIpList[@]}" b "${blockstreamerIpList[@]}" r "${archiverIpList[@]}"; do
|
|
if [[ $ipAddress = b ]]; then
|
|
nodeType=blockstreamer
|
|
continue
|
|
elif [[ $ipAddress = r ]]; then
|
|
nodeType=archiver
|
|
continue
|
|
fi
|
|
|
|
if [[ $ipAddress = "$nodeAddress" ]]; then
|
|
echo "getNodeType: $nodeType ($nodeIndex)"
|
|
return
|
|
fi
|
|
((nodeIndex = nodeIndex + 1))
|
|
done
|
|
|
|
echo "Error: Unknown node: $nodeAddress"
|
|
exit 1
|
|
}
|
|
|
|
prepare_deploy() {
|
|
case $deployMethod in
|
|
tar)
|
|
if [[ -n $releaseChannel ]]; then
|
|
rm -f "$SOLANA_ROOT"/solana-release.tar.bz2
|
|
declare updateDownloadUrl=http://release.solana.com/"$releaseChannel"/solana-release-x86_64-unknown-linux-gnu.tar.bz2
|
|
(
|
|
set -x
|
|
curl --retry 5 --retry-delay 2 --retry-connrefused \
|
|
-o "$SOLANA_ROOT"/solana-release.tar.bz2 "$updateDownloadUrl"
|
|
)
|
|
tarballFilename="$SOLANA_ROOT"/solana-release.tar.bz2
|
|
fi
|
|
(
|
|
set -x
|
|
rm -rf "$SOLANA_ROOT"/solana-release
|
|
(cd "$SOLANA_ROOT"; tar jxv) < "$tarballFilename"
|
|
cat "$SOLANA_ROOT"/solana-release/version.yml
|
|
)
|
|
;;
|
|
local)
|
|
if $doBuild; then
|
|
build
|
|
else
|
|
echo "Build skipped due to --no-build"
|
|
fi
|
|
;;
|
|
skip)
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
|
|
if [[ -n $deployIfNewer ]]; then
|
|
if [[ $deployMethod != tar ]]; then
|
|
echo "Error: --deploy-if-newer only supported for tar deployments"
|
|
exit 1
|
|
fi
|
|
|
|
echo "Fetching current software version"
|
|
(
|
|
set -x
|
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "${validatorIpList[0]}":~/version.yml current-version.yml
|
|
)
|
|
cat current-version.yml
|
|
if ! diff -q current-version.yml "$SOLANA_ROOT"/solana-release/version.yml; then
|
|
echo "Cluster software version is old. Update required"
|
|
else
|
|
echo "Cluster software version is current. No update required"
|
|
exit 0
|
|
fi
|
|
fi
|
|
}
|
|
|
|
deploy() {
|
|
initLogDir
|
|
|
|
echo "Deployment started at $(date)"
|
|
$metricsWriteDatapoint "testnet-deploy net-start-begin=1"
|
|
|
|
declare bootstrapLeader=true
|
|
for nodeAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}"; do
|
|
nodeType=
|
|
nodeIndex=
|
|
getNodeType
|
|
if $bootstrapLeader; then
|
|
SECONDS=0
|
|
declare bootstrapNodeDeployTime=
|
|
startBootstrapLeader "$nodeAddress" $nodeIndex "$netLogDir/bootstrap-leader-$ipAddress.log"
|
|
bootstrapNodeDeployTime=$SECONDS
|
|
$metricsWriteDatapoint "testnet-deploy net-bootnode-leader-started=1"
|
|
|
|
bootstrapLeader=false
|
|
SECONDS=0
|
|
pids=()
|
|
else
|
|
startNode "$ipAddress" $nodeType $nodeIndex
|
|
|
|
# Stagger additional node start time. If too many nodes start simultaneously
|
|
# the bootstrap node gets more rsync requests from the additional nodes than
|
|
# it can handle.
|
|
sleep 2
|
|
fi
|
|
done
|
|
|
|
|
|
for pid in "${pids[@]}"; do
|
|
declare ok=true
|
|
wait "$pid" || ok=false
|
|
if ! $ok; then
|
|
echo "+++ validator failed to start"
|
|
cat "$netLogDir/validator-$pid.log"
|
|
if $failOnValidatorBootupFailure; then
|
|
exit 1
|
|
else
|
|
echo "Failure is non-fatal"
|
|
fi
|
|
fi
|
|
done
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-validators-started=1"
|
|
additionalNodeDeployTime=$SECONDS
|
|
|
|
annotateBlockexplorerUrl
|
|
|
|
sanity skipBlockstreamerSanity # skip sanity on blockstreamer node, it may not
|
|
# have caught up to the bootstrap leader yet
|
|
|
|
echo "--- Sleeping $clientDelayStart seconds after validators are started before starting clients"
|
|
sleep "$clientDelayStart"
|
|
|
|
SECONDS=0
|
|
for ((i=0; i < "$numClients" && i < "$numClientsRequested"; i++)) do
|
|
if [[ $i -lt "$numBenchTpsClients" ]]; then
|
|
startClient "${clientIpList[$i]}" "solana-bench-tps" "$i"
|
|
elif [[ $i -lt $((numBenchTpsClients + numBenchExchangeClients)) ]]; then
|
|
startClient "${clientIpList[$i]}" "solana-bench-exchange" $((i-numBenchTpsClients))
|
|
else
|
|
startClient "${clientIpList[$i]}" "idle"
|
|
fi
|
|
done
|
|
clientDeployTime=$SECONDS
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-start-complete=1"
|
|
|
|
declare networkVersion=unknown
|
|
case $deployMethod in
|
|
tar)
|
|
networkVersion="$(
|
|
(
|
|
set -o pipefail
|
|
grep "^commit: " "$SOLANA_ROOT"/solana-release/version.yml | head -n1 | cut -d\ -f2
|
|
) || echo "tar-unknown"
|
|
)"
|
|
;;
|
|
local)
|
|
networkVersion="$(git rev-parse HEAD || echo local-unknown)"
|
|
;;
|
|
skip)
|
|
;;
|
|
*)
|
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
|
;;
|
|
esac
|
|
$metricsWriteDatapoint "testnet-deploy version=\"${networkVersion:0:9}\""
|
|
|
|
echo
|
|
echo "+++ Deployment Successful"
|
|
echo "Bootstrap leader deployment took $bootstrapNodeDeployTime seconds"
|
|
echo "Additional validator deployment (${#validatorIpList[@]} validators, ${#blockstreamerIpList[@]} blockstreamer nodes, ${#archiverIpList[@]} archivers) took $additionalNodeDeployTime seconds"
|
|
echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
|
|
echo "Network start logs in $netLogDir"
|
|
}
|
|
|
|
stopNode() {
|
|
local ipAddress=$1
|
|
local block=$2
|
|
|
|
initLogDir
|
|
declare logFile="$netLogDir/stop-validator-$ipAddress.log"
|
|
|
|
echo "--- Stopping node: $ipAddress"
|
|
echo "stop log: $logFile"
|
|
syncScripts "$ipAddress"
|
|
(
|
|
# Since cleanup.sh does a pkill, we cannot pass the command directly,
|
|
# otherwise the process which is doing the killing will be killed because
|
|
# the script itself will match the pkill pattern
|
|
set -x
|
|
# shellcheck disable=SC2029 # It's desired that PS4 be expanded on the client side
|
|
ssh "${sshOptions[@]}" "$ipAddress" "PS4=\"$PS4\" ./solana/net/remote/cleanup.sh"
|
|
) >> "$logFile" 2>&1 &
|
|
|
|
declare pid=$!
|
|
ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log"
|
|
if $block; then
|
|
wait $pid
|
|
else
|
|
pids+=("$pid")
|
|
fi
|
|
}
|
|
|
|
stop() {
|
|
SECONDS=0
|
|
$metricsWriteDatapoint "testnet-deploy net-stop-begin=1"
|
|
|
|
declare loopCount=0
|
|
pids=()
|
|
for ipAddress in "${validatorIpList[@]}" "${blockstreamerIpList[@]}" "${archiverIpList[@]}" "${clientIpList[@]}"; do
|
|
stopNode "$ipAddress" false
|
|
|
|
# Stagger additional node stop time to avoid too many concurrent ssh
|
|
# sessions
|
|
((loopCount++ % 4 == 0)) && sleep 2
|
|
done
|
|
|
|
echo --- Waiting for nodes to finish stopping
|
|
for pid in "${pids[@]}"; do
|
|
echo -n "$pid "
|
|
wait "$pid" || true
|
|
done
|
|
echo
|
|
|
|
$metricsWriteDatapoint "testnet-deploy net-stop-complete=1"
|
|
echo "Stopping nodes took $SECONDS seconds"
|
|
}
|
|
|
|
checkPremptibleInstances() {
|
|
# The validatorIpList nodes may be preemptible instances that can disappear at
|
|
# any time. Try to detect when a validator has been preempted to help the user
|
|
# out.
|
|
#
|
|
# Of course this isn't airtight as an instance could always disappear
|
|
# immediately after its successfully pinged.
|
|
for ipAddress in "${validatorIpList[@]}"; do
|
|
(
|
|
set -x
|
|
timeout 5s ping -c 1 "$ipAddress" | tr - _
|
|
) || {
|
|
cat <<EOF
|
|
|
|
Warning: $ipAddress may have been preempted.
|
|
|
|
Run |./gce.sh config| to restart it
|
|
EOF
|
|
exit 1
|
|
}
|
|
done
|
|
}
|
|
|
|
checkPremptibleInstances
|
|
|
|
case $command in
|
|
restart)
|
|
prepare_deploy
|
|
stop
|
|
deploy
|
|
;;
|
|
start)
|
|
prepare_deploy
|
|
deploy
|
|
;;
|
|
sanity)
|
|
sanity
|
|
;;
|
|
stop)
|
|
stop
|
|
;;
|
|
update)
|
|
deployUpdate
|
|
;;
|
|
stopnode)
|
|
if [[ -z $nodeAddress ]]; then
|
|
usage "node address (-i) not specified"
|
|
exit 1
|
|
fi
|
|
stopNode "$nodeAddress" true
|
|
;;
|
|
startnode)
|
|
if [[ -z $nodeAddress ]]; then
|
|
usage "node address (-i) not specified"
|
|
exit 1
|
|
fi
|
|
nodeType=
|
|
nodeIndex=
|
|
getNodeType
|
|
startNode "$nodeAddress" $nodeType $nodeIndex
|
|
;;
|
|
logs)
|
|
initLogDir
|
|
fetchRemoteLog() {
|
|
declare ipAddress=$1
|
|
declare log=$2
|
|
echo "--- fetching $log from $ipAddress"
|
|
(
|
|
set -x
|
|
timeout 30s scp "${sshOptions[@]}" \
|
|
"$ipAddress":solana/"$log".log "$netLogDir"/remote-"$log"-"$ipAddress".log
|
|
) || echo "failed to fetch log"
|
|
}
|
|
fetchRemoteLog "${validatorIpList[0]}" faucet
|
|
for ipAddress in "${validatorIpList[@]}"; do
|
|
fetchRemoteLog "$ipAddress" validator
|
|
done
|
|
for ipAddress in "${clientIpList[@]}"; do
|
|
fetchRemoteLog "$ipAddress" client
|
|
done
|
|
for ipAddress in "${blockstreamerIpList[@]}"; do
|
|
fetchRemoteLog "$ipAddress" validator
|
|
done
|
|
for ipAddress in "${archiverIpList[@]}"; do
|
|
fetchRemoteLog "$ipAddress" validator
|
|
done
|
|
;;
|
|
netem)
|
|
if [[ -n $netemConfigFile ]]; then
|
|
if [[ $netemCommand = "add" ]]; then
|
|
for ipAddress in "${validatorIpList[@]}"; do
|
|
"$here"/scp.sh "$netemConfigFile" solana@"$ipAddress":~/solana
|
|
done
|
|
fi
|
|
for i in "${!validatorIpList[@]}"; do
|
|
"$here"/ssh.sh solana@"${validatorIpList[$i]}" 'solana/scripts/net-shaper.sh' \
|
|
"$netemCommand" ~solana/solana/"$netemConfigFile" "${#validatorIpList[@]}" "$i"
|
|
done
|
|
else
|
|
num_nodes=$((${#validatorIpList[@]}*netemPartition/100))
|
|
if [[ $((${#validatorIpList[@]}*netemPartition%100)) -gt 0 ]]; then
|
|
num_nodes=$((num_nodes+1))
|
|
fi
|
|
if [[ "$num_nodes" -gt "${#validatorIpList[@]}" ]]; then
|
|
num_nodes=${#validatorIpList[@]}
|
|
fi
|
|
|
|
# Stop netem on all nodes
|
|
for ipAddress in "${validatorIpList[@]}"; do
|
|
"$here"/ssh.sh solana@"$ipAddress" 'solana/scripts/netem.sh delete < solana/netem.cfg || true'
|
|
done
|
|
|
|
# Start netem on required nodes
|
|
for ((i=0; i<num_nodes; i++ )); do :
|
|
"$here"/ssh.sh solana@"${validatorIpList[$i]}" "echo $netemConfig > solana/netem.cfg; solana/scripts/netem.sh add \"$netemConfig\""
|
|
done
|
|
fi
|
|
;;
|
|
*)
|
|
echo "Internal error: Unknown command: $command"
|
|
usage
|
|
exit 1
|
|
esac
|