Add -R option to restart the cluster incrementally

This commit is contained in:
Michael Vines
2019-01-21 14:33:46 -08:00
committed by Grimes
parent dca0ba6a5d
commit 1cdab81a3c

View File

@ -3,6 +3,7 @@ set -e
iterations=1 iterations=1
restartInterval=never restartInterval=never
rollingRestart=false
maybeNoLeaderRotation= maybeNoLeaderRotation=
extraNodes=0 extraNodes=0
walletRpcEndpoint= walletRpcEndpoint=
@ -21,6 +22,9 @@ Start a local cluster and run sanity on it
options: options:
-i [number] - Number of times to run sanity (default: $iterations) -i [number] - Number of times to run sanity (default: $iterations)
-k [number] - Restart the cluster after this number of sanity iterations (default: $restartInterval) -k [number] - Restart the cluster after this number of sanity iterations (default: $restartInterval)
-R - Restart the cluster by incrementally stopping and restarting
nodes (at the cadence specified by -k). When disabled all
nodes will be first killed then restarted (default: $rollingRestart)
-b - Disable leader rotation -b - Disable leader rotation
-x - Add an extra fullnode (may be supplied multiple times) -x - Add an extra fullnode (may be supplied multiple times)
-r - Select the RPC endpoint hosted by a node that starts as -r - Select the RPC endpoint hosted by a node that starts as
@ -33,7 +37,7 @@ EOF
cd "$(dirname "$0")"/.. cd "$(dirname "$0")"/..
while getopts "h?i:k:brx" opt; do while getopts "h?i:k:brxR" opt; do
case $opt in case $opt in
h | \?) h | \?)
usage usage
@ -53,6 +57,9 @@ while getopts "h?i:k:brx" opt; do
r) r)
walletRpcEndpoint="--rpc-port 18899" walletRpcEndpoint="--rpc-port 18899"
;; ;;
R)
rollingRestart=true
;;
*) *)
usage "Error: unhandled option: $opt" usage "Error: unhandled option: $opt"
;; ;;
@ -76,40 +83,100 @@ numNodes=$((2 + extraNodes))
pids=() pids=()
logs=() logs=()
# Print the log file name associated with a node command line.
#   $1 - node command, optionally followed by space-separated arguments
# Output: "log-<name>.txt" on stdout, where <name> is the basename of the
#         command's first word with a trailing ".sh" removed.
getNodeLogFile() {
  local nodeCmd=$1
  # Keep only the text before the first space, i.e. the script path.
  local scriptPath=${nodeCmd%% *}
  local scriptName
  scriptName=$(basename "$scriptPath" .sh)
  printf 'log-%s.txt\n' "$scriptName"
}
# Launch one node command in the background and record its pid.
#   $1 - node command line (script path plus optional arguments)
# Globals: pids (appended with the pid of the background job)
# Output:  a "--- Start" banner and the pid on stdout; the node's own
#          stdout/stderr go to the file named by getNodeLogFile, which is
#          removed first so each start begins with a fresh log.
startNode() {
  local nodeCmd=$1
  local logFile
  local childPid

  printf '%s\n' "--- Start $nodeCmd"

  logFile=$(getNodeLogFile "$nodeCmd")
  rm -f "$logFile"

  # Deliberately unquoted: the command string carries its own arguments and
  # must be word-split into program + args.
  $nodeCmd > "$logFile" 2>&1 &
  childPid=$!

  pids+=("$childPid")
  printf '%s\n' "pid: $childPid"
}
startNodes() { startNodes() {
declare addLogs=false declare addLogs=false
if [[ ${#logs[@]} -eq 0 ]]; then if [[ ${#logs[@]} -eq 0 ]]; then
addLogs=true addLogs=true
fi fi
for cmd in "${nodes[@]}"; do for cmd in "${nodes[@]}"; do
echo "--- Start $cmd" startNode "$cmd"
baseCmd=$(basename "${cmd// */}" .sh)
declare log=log-$baseCmd.txt
rm -f "$log"
$cmd > "$log" 2>&1 &
if $addLogs; then if $addLogs; then
logs+=("$log") logs+=("$(getNodeLogFile "$cmd")")
fi fi
declare pid=$!
pids+=("$pid")
echo "pid: $pid"
done done
} }
killNodes() { killNode() {
echo "--- Killing nodes" declare pid=$1
echo "kill $pid"
set +e set +e
for pid in "${pids[@]}"; do
if kill "$pid"; then if kill "$pid"; then
wait "$pid" wait "$pid"
else else
echo -e "^^^ +++\\nWarning: unable to kill $pid" echo -e "^^^ +++\\nWarning: unable to kill $pid"
fi fi
done
set -e set -e
}
killNodes() {
echo "--- Killing nodes"
for pid in "${pids[@]}"; do
killNode "$pid"
done
pids=() pids=()
} }
# Restart the cluster one node at a time instead of all at once.
#
# The first entry of `nodes` must be the drone, which is left running. Every
# other node is killed, given time to be noticed as gone by its peers, and
# then started again with the same command.
#
# Globals:
#   nodes (read)    - node command lines; nodes[0] must be the drone
#   logs  (read)    - only its length is checked against nodes
#   pids  (updated) - on return holds the drone pid followed by the pids of
#                     the freshly started nodes
# Exits the script with status 1 if the arrays are inconsistent or the first
# node is not the drone.
rollingNodeRestart() {
  if [[ ${#logs[@]} -ne ${#nodes[@]} ]]; then
    echo "Error: log/nodes array length mismatch"
    exit 1
  fi
  if [[ ${#pids[@]} -ne ${#nodes[@]} ]]; then
    echo "Error: pids/nodes array length mismatch"
    exit 1
  fi

  # First cmd should be the drone, don't restart it.  Check this explicitly
  # (rather than relying on errexit) so a violation always stops the script
  # and says why.  This also rejects an empty node list.
  if [[ "${nodes[0]:-}" != "multinode-demo/drone.sh" ]]; then
    echo "Error: first node is not the drone: ${nodes[0]:-<none>}"
    exit 1
  fi

  local -a oldPids=("${pids[@]}")
  local i pid cmd

  # Index 0 is the drone, so start at 1.  startNode appends each new pid to
  # `pids`; the old pids stay in the array until the whole pass is complete.
  for ((i = 1; i < ${#nodes[@]}; i++)); do
    pid=${oldPids[i]}
    cmd=${nodes[i]}

    echo "--- Restarting $pid: $cmd"
    killNode "$pid"

    # Delay 20 seconds to ensure the remaining cluster nodes will
    # hit CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS (currently 15 seconds) for the
    # node that was just stopped
    echo "(sleeping for 20 seconds)"
    sleep 20

    startNode "$cmd"
  done

  # 'Atomically' remove the old pids from the pids array: build the new list
  # on the side and swap it in with a single assignment.
  local oldPidsList
  oldPidsList="$(printf ":%s" "${oldPids[@]}"):"
  local -a newPids=("${pids[0]}") # 0 = drone pid
  for pid in "${pids[@]}"; do
    # Keep only pids that were not present before the restart pass.
    if [[ $oldPidsList != *":$pid:"* ]]; then
      newPids+=("$pid")
    fi
  done
  pids=("${newPids[@]}")
}
verifyLedger() { verifyLedger() {
for ledger in bootstrap-leader fullnode; do for ledger in bootstrap-leader fullnode; do
echo "--- $ledger ledger verification" echo "--- $ledger ledger verification"
@ -200,10 +267,14 @@ while [[ $iteration -le $iterations ]]; do
iteration=$((iteration + 1)) iteration=$((iteration + 1))
if [[ $restartInterval != never && $((iteration % restartInterval)) -eq 0 ]]; then if [[ $restartInterval != never && $((iteration % restartInterval)) -eq 0 ]]; then
if $rollingRestart; then
rollingNodeRestart
else
killNodes killNodes
verifyLedger verifyLedger
startNodes startNodes
fi fi
fi
done done
killNodes killNodes