snap mode is for daemons, remove it from client (i.e. common.sh); supply leader info to client via snap

#!/bin/bash -e
#
# Deploys the Solana software running on the testnet full nodes
#
# This script must be run by a user/machine that has successfully authenticated
# with GCP and has sufficient permission.
#

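# Example invocation (illustrative; the script filename and values shown are
# assumptions, not part of this file):
#   SOLANA_METRICS_CONFIG=... SOLANA_SNAP_CHANNEL=edge ./testnet-deploy.sh
#   SOLANA_METRICS_CONFIG=... ./testnet-deploy.sh path/to/solana_local.snap
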
here=$(dirname "$0")
metrics_write_datapoint="$here"/../scripts/metrics-write-datapoint.sh

# shellcheck source=scripts/gcloud.sh
source "$here"/../scripts/gcloud.sh

# TODO: Switch over to rolling updates
ROLLING_UPDATE=false
#ROLLING_UPDATE=true

if [[ -z $SOLANA_METRICS_CONFIG ]]; then
  echo Error: SOLANA_METRICS_CONFIG environment variable is unset
  exit 1
fi

# Default to edge channel. To select the beta channel:
#   export SOLANA_SNAP_CHANNEL=beta
if [[ -z $SOLANA_SNAP_CHANNEL ]]; then
  SOLANA_SNAP_CHANNEL=edge
fi

# Select default network URL based on SOLANA_SNAP_CHANNEL if SOLANA_NET_ENTRYPOINT is
# unspecified
if [[ -z $SOLANA_NET_ENTRYPOINT ]]; then
  case $SOLANA_SNAP_CHANNEL in
  edge)
    SOLANA_NET_ENTRYPOINT=master.testnet.solana.com
    unset SOLANA_NET_NAME
    ;;
  beta)
    SOLANA_NET_ENTRYPOINT=testnet.solana.com
    unset SOLANA_NET_NAME
    ;;
  *)
    echo Error: Unknown SOLANA_SNAP_CHANNEL=$SOLANA_SNAP_CHANNEL
    exit 1
    ;;
  esac
fi

if [[ -z $SOLANA_NET_NAME ]]; then
  SOLANA_NET_NAME=${SOLANA_NET_ENTRYPOINT//./-}
fi

: ${SOLANA_NET_NAME:?$SOLANA_NET_ENTRYPOINT}
netBasename=${SOLANA_NET_NAME/-testnet-solana-com/}
if [[ $netBasename != testnet ]]; then
  netBasename="testnet-$netBasename"
fi

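# For example, with the edge defaults above, SOLANA_NET_ENTRYPOINT=master.testnet.solana.com
# gives SOLANA_NET_NAME=master-testnet-solana-com and netBasename=testnet-master.
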
# Figure out the snap installation command
SNAP_INSTALL_CMD="\
  for i in {1..3}; do \
    sudo snap install solana --$SOLANA_SNAP_CHANNEL --devmode && break; \
    sleep 1; \
  done \
"
LOCAL_SNAP=$1
if [[ -n $LOCAL_SNAP ]]; then
  if [[ ! -f $LOCAL_SNAP ]]; then
    echo "Error: $LOCAL_SNAP is not a file"
    exit 1
  fi
  SNAP_INSTALL_CMD="sudo snap install ~/solana_local.snap --devmode --dangerous"
fi
SNAP_INSTALL_CMD="sudo snap remove solana; $SNAP_INSTALL_CMD"

EARLYOOM_INSTALL_CMD="\
  wget --retry-connrefused --waitretry=1 \
    --read-timeout=20 --timeout=15 --tries=5 \
    -O install-earlyoom.sh \
    https://raw.githubusercontent.com/solana-labs/solana/master/ci/install-earlyoom.sh; \
  bash install-earlyoom.sh \
"
SNAP_INSTALL_CMD="$EARLYOOM_INSTALL_CMD; $SNAP_INSTALL_CMD"

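# At this point SNAP_INSTALL_CMD is roughly (illustrative, assuming no local snap and
# SOLANA_SNAP_CHANNEL=edge):
#   <install earlyoom>; sudo snap remove solana; \
#   for i in {1..3}; do sudo snap install solana --edge --devmode && break; sleep 1; done
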
# `export SKIP_INSTALL=1` to reset the network without reinstalling the snap
if [[ -n $SKIP_INSTALL ]]; then
  SNAP_INSTALL_CMD="echo Install skipped"
fi

echo "+++ Configuration for $netBasename"
publicUrl="$SOLANA_NET_ENTRYPOINT"
if [[ $publicUrl = testnet.solana.com ]]; then
  publicIp="" # Use default value
else
  publicIp=$(dig +short $publicUrl | head -n1)
fi

echo "Network name: $SOLANA_NET_NAME"
echo "Network entry point URL: $publicUrl ($publicIp)"
echo "Snap channel: $SOLANA_SNAP_CHANNEL"
echo "Install command: $SNAP_INSTALL_CMD"
echo "Setup args: $SOLANA_SETUP_ARGS"
[[ -z $LOCAL_SNAP ]] || echo "Local snap: $LOCAL_SNAP"

vmlist=() # Each array element is formatted as "class:vmName:vmZone:vmPublicIp"

vm_exec() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3
  declare message=$4
  declare cmd=$5

  echo "--- $message $vmName in zone $vmZone ($vmPublicIp)"
  ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    testnet-deploy@"$vmPublicIp" "$cmd"
}

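# Example (as used below in delete_unreachable_validators):
#   vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Checking $vmName" uptime
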
#
# vm_foreach [cmd] [extra args to cmd]
# where
#   cmd - the command to execute on each VM
#         The command will receive five fixed arguments, followed by any
#         additional arguments supplied to vm_foreach:
#             vmName     - GCP name of the VM
#             vmZone     - The GCP zone the VM is located in
#             vmPublicIp - The public IP address of this VM
#             vmClass    - The 'class' of this VM
#             count      - Monotonically increasing count for each
#                          invocation of cmd, starting at 1
#             ...        - Extra args to cmd
#
vm_foreach() {
  declare cmd=$1
  shift

  declare count=1
  for info in "${vmlist[@]}"; do
    declare vmClass vmName vmZone vmPublicIp
    IFS=: read -r vmClass vmName vmZone vmPublicIp _ < <(echo "$info")

    eval "$cmd" "$vmName" "$vmZone" "$vmPublicIp" "$vmClass" "$count" "$@"
    count=$((count + 1))
  done
}

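# Example (illustrative, with a hypothetical helper):
#   print_vm() { echo "$5: $1 [$4] in zone $2 ($3)"; }
#   vm_foreach print_vm
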
#
# vm_foreach_in_class [class] [cmd]
# where
#   class - the desired VM class to operate on
#   cmd   - the command to execute on each VM in the desired class.
#           The command will receive four arguments:
#               vmName     - GCP name of the VM
#               vmZone     - The GCP zone the VM is located in
#               vmPublicIp - The public IP address of this VM
#               count      - Monotonically increasing count for each
#                            invocation of cmd, starting at 1
#
_run_cmd_if_class() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3
  declare vmClass=$4
  declare count=$5
  declare class=$6
  declare cmd=$7
  if [[ $class = "$vmClass" ]]; then
    eval "$cmd" "$vmName" "$vmZone" "$vmPublicIp" "$count"
  fi
}

vm_foreach_in_class() {
  declare class=$1
  declare cmd=$2
  vm_foreach _run_cmd_if_class "$class" "$cmd"
}

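# Example (illustrative, with a hypothetical helper):
#   check_validator() { echo "validator #$4: $1 in zone $2 ($3)"; }
#   vm_foreach_in_class validator check_validator
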
#
# Load all VMs matching the specified filter and tag them with the specified
# class into the `vmlist` array.
findVms() {
  declare class="$1"
  declare filter="$2"

  gcloud_FindInstances "$filter" show
  vmlist+=("${instances[@]/#/$class:}")
}

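# e.g. `findVms validator "name~^$SOLANA_NET_NAME-validator-"` appends entries of the
# form "validator:<vmName>:<vmZone>:<vmPublicIp>" (gcloud_FindInstances fills `instances`).
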
wait_for_pids() {
  echo "--- Waiting for $*"
  for pid in "${pids[@]}"; do
    declare ok=true
    wait "$pid" || ok=false
    cat "log-$pid.txt"
    if ! $ok; then
      echo ^^^ +++
      exit 1
    fi
    rm "log-$pid.txt"
  done
}

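# Typical pattern (as used below): reset pids, launch background jobs that each leave
# a log-<pid>.txt behind, then block on them:
#   pids=()
#   vm_foreach_in_class validator delete_unreachable_validators
#   wait_for_pids validator sanity check
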
delete_unreachable_validators() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3

  touch "log-$vmName.txt"
  (
    SECONDS=0
    if ! vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Checking $vmName" uptime; then
      echo "^^^ +++"

      # Validators are managed by a Compute Engine Instance Group, so deleting
      # one will just cause a new one to be spawned.
      echo "Warning: $vmName is unreachable, deleting it"
      gcloud_DeleteInstances "$vmName"
    fi
    echo "validator checked in ${SECONDS} seconds"
  ) >> "log-$vmName.txt" 2>&1 &
  declare pid=$!

  # Rename log file so it can be discovered later by $pid
  mv "log-$vmName.txt" "log-$pid.txt"
  pids+=("$pid")
}


echo "Validator nodes (unverified):"
findVms validator "name~^$SOLANA_NET_NAME-validator-"
pids=()
vm_foreach_in_class validator delete_unreachable_validators
wait_for_pids validator sanity check
vmlist=()

echo "Leader node:"
findVms leader "name=$SOLANA_NET_NAME"
[[ ${#vmlist[@]} = 1 ]] || {
  echo "Unable to find $SOLANA_NET_NAME"
  exit 1
}

echo "Client node(s):"
findVms client "name~^$SOLANA_NET_NAME-client"

echo "Validator nodes:"
findVms validator "name~^$SOLANA_NET_NAME-validator-"

fullnode_count=0
inc_fullnode_count() {
  fullnode_count=$((fullnode_count + 1))
}
vm_foreach_in_class leader inc_fullnode_count
vm_foreach_in_class validator inc_fullnode_count

# Add "network stopping" datapoint
$metrics_write_datapoint "testnet-deploy,name=$netBasename stop=1"
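# The datapoint is InfluxDB line protocol: measurement "testnet-deploy", tag
# name=$netBasename, field stop=1 (metrics-write-datapoint.sh is assumed to POST it
# to the metrics database).
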
client_start() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3
  declare count=$4

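  # Leader info reaches the client via snap configuration (leader-ip below), per the
  # commit title above; clients no longer receive a snap "mode" setting.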
  nodeConfig="\
    rust-log=$RUST_LOG \
    default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
    metrics-config=$SOLANA_METRICS_CONFIG \
    setup-args=$SOLANA_SETUP_ARGS \
    leader-ip=$publicIp \
  "

  vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
    "Starting client $count:" \
    "\
      set -x;
      sudo snap set solana $nodeConfig; \
      snap info solana; \
      sudo snap get solana; \
      threadCount=\$(nproc); \
      if [[ \$threadCount -gt 4 ]]; then threadCount=4; fi; \
      tmux kill-session -t solana; \
      tmux new -s solana -d \" \
        set -x; \
        sudo rm /tmp/solana.log; \
        while : ; do \
          /snap/bin/solana.bench-tps --num-nodes $fullnode_count --seconds 600 --sustained --threads \$threadCount 2>&1 | tee -a /tmp/solana.log; \
          echo 'https://metrics.solana.com:8086/write?db=${INFLUX_DATABASE}&u=${INFLUX_USERNAME}&p=${INFLUX_PASSWORD}' \
            | xargs curl --max-time 5 -XPOST --data-binary 'testnet-deploy,name=$netBasename clientexit=1'; \
          echo Error: bench-tps should never exit | tee -a /tmp/solana.log; \
        done; \
        bash \
      \"; \
      sleep 2; \
      tmux capture-pane -t solana -p -S -100; \
      tail /tmp/solana.log; \
    "
}

client_stop() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3
  declare count=$4

  touch "log-$vmName.txt"
  (
    SECONDS=0
    vm_exec "$vmName" "$vmZone" "$vmPublicIp" \
      "Stopping client $vmName ($count):" \
      "\
        set -x;
        tmux list-sessions; \
        tmux capture-pane -t solana -p; \
        tmux kill-session -t solana; \
        $SNAP_INSTALL_CMD; \
        sudo snap set solana metrics-config=$SOLANA_METRICS_CONFIG \
          rust-log=$RUST_LOG \
          default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
        ; \
      "
    echo "Client stopped in ${SECONDS} seconds"
  ) >> "log-$vmName.txt" 2>&1 &
  declare pid=$!

  # Rename log file so it can be discovered later by $pid
  mv "log-$vmName.txt" "log-$pid.txt"
  pids+=("$pid")
}

fullnode_start() {
  declare class=$1
  declare vmName=$2
  declare vmZone=$3
  declare vmPublicIp=$4
  declare count=$5

  touch "log-$vmName.txt"
  (
    SECONDS=0
    commonNodeConfig="\
      rust-log=$RUST_LOG \
      default-metrics-rate=$SOLANA_DEFAULT_METRICS_RATE \
      metrics-config=$SOLANA_METRICS_CONFIG \
      setup-args=$SOLANA_SETUP_ARGS \
      leader-ip=$publicIp \
    "
    if [[ $class = leader ]]; then
      nodeConfig="mode=leader+drone $commonNodeConfig"
      if [[ -n $SOLANA_CUDA ]]; then
        nodeConfig="$nodeConfig enable-cuda=1"
      fi
    else
      nodeConfig="mode=validator $commonNodeConfig"
    fi

    vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Starting $class $count:" \
      "\
        set -ex; \
        logmarker='solana deploy $(date)/$RANDOM'; \
        logger \"\$logmarker\"; \
        $SNAP_INSTALL_CMD; \
        sudo snap set solana $nodeConfig; \
        snap info solana; \
        sudo snap get solana; \
        echo Slight delay to get more syslog output; \
        sleep 2; \
        sudo grep -Pzo \"\$logmarker(.|\\n)*\" /var/log/syslog \
      "
    echo "Succeeded in ${SECONDS} seconds"
  ) >> "log-$vmName.txt" 2>&1 &
  declare pid=$!

  # Rename log file so it can be discovered later by $pid
  mv "log-$vmName.txt" "log-$pid.txt"

  pids+=("$pid")
}

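# leader_start/validator_start are thin wrappers that fix the class argument;
# vm_foreach_in_class supplies the remaining vmName, vmZone, vmPublicIp, and count args.
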
leader_start() {
  fullnode_start leader "$@"
}

validator_start() {
  fullnode_start validator "$@"
}

fullnode_stop() {
  declare vmName=$1
  declare vmZone=$2
  declare vmPublicIp=$3
  declare count=$4

  touch "log-$vmName.txt"
  (
    SECONDS=0
    # Try to ping the machine first. When a machine (validator) is restarted,
    # there can be a delay between when the instance is reported as RUNNING and when
    # it's reachable over the network
    timeout 30s bash -c "set -o pipefail; until ping -c 3 $vmPublicIp | tr - _; do echo .; done"
    vm_exec "$vmName" "$vmZone" "$vmPublicIp" "Shutting down" "\
      if snap list solana; then \
        sudo snap set solana mode=; \
      fi"
    echo "Succeeded in ${SECONDS} seconds"
  ) >> "log-$vmName.txt" 2>&1 &
  declare pid=$!

  # Rename log file so it can be discovered later by $pid
  mv "log-$vmName.txt" "log-$pid.txt"

  pids+=("$pid")
}

if [[ -n $LOCAL_SNAP ]]; then
  echo "--- Transferring $LOCAL_SNAP to node(s)"

  transfer_local_snap() {
    declare vmName=$1
    declare vmZone=$2
    declare vmPublicIp=$3
    declare vmClass=$4
    declare count=$5

    echo "--- $vmName in zone $vmZone ($count)"
    SECONDS=0
    scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$LOCAL_SNAP" testnet-deploy@"$vmPublicIp":solana_local.snap
    echo "Succeeded in ${SECONDS} seconds"
  }
  vm_foreach transfer_local_snap
fi

echo "--- Stopping client node(s)"
pids=()
vm_foreach_in_class client client_stop
client_stop_pids=("${pids[@]}")

if ! $ROLLING_UPDATE; then
  pids=()
  echo "--- Shutting down all full nodes"
  vm_foreach_in_class leader fullnode_stop
  vm_foreach_in_class validator fullnode_stop
  wait_for_pids fullnode shutdown
fi

pids=()
echo --- Starting leader node
vm_foreach_in_class leader leader_start
wait_for_pids leader

pids=()
echo --- Starting validator nodes
vm_foreach_in_class validator validator_start
wait_for_pids validators

echo "--- $publicUrl sanity test"
|
|
if [[ -z $CI ]]; then
|
|
# TODO: ssh into a node and run testnet-sanity.sh there. It's not safe to
|
|
# assume the correct Snap is installed on the current non-CI machine
|
|
echo Skipped for non-CI deploy
|
|
snapVersion=unknown
|
|
else
|
|
(
|
|
set -x
|
|
USE_SNAP=1 ci/testnet-sanity.sh $publicUrl $fullnode_count
|
|
)
|
|
IFS=\ read -r _ snapVersion _ < <(snap info solana | grep "^installed:")
|
|
snapVersion=${snapVersion/0+git./}
|
|
fi
|
|
|
|
pids=("${client_stop_pids[@]}")
|
|
wait_for_pids client shutdown
|
|
vm_foreach_in_class client client_start
|
|
|
|
# Add "network started" datapoint
|
|
$metrics_write_datapoint "testnet-deploy,name=$netBasename start=1,version=\"$snapVersion\""
|
|
|
|
exit 0
|