v0.14: various net/ fixes for large clusters (#4080)

* net.sh: Add -F to discard validator nodes that didn't boot up successfully

* Relax sanity node count when validator bootup failure is permitted

* Less sanity for testnet-demo

* net.sh: Add -F to discard validator nodes that didn't boot up successfully
This commit is contained in:
Michael Vines
2019-04-29 21:38:03 -07:00
parent bd0871cbe7
commit d21fa4a177
7 changed files with 71 additions and 41 deletions

View File

@@ -439,13 +439,15 @@ EOF
for zone in "${zones[@]}"; do
echo "Looking for additional fullnode instances in $zone ..."
cloud_FindInstances "$prefix-$zone-fullnode"
[[ ${#instances[@]} -gt 0 ]] || {
if [[ ${#instances[@]} -gt 0 ]]; then
fetchPrivateKey
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
else
echo "Unable to find additional fullnodes"
exit 1
}
fetchPrivateKey
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
if $failOnValidatorBootupFailure; then
exit 1
fi
fi
done
fi

View File

@@ -51,6 +51,7 @@ Operate a configured testnet
to the bench-tps client.
sanity/start/update-specific options:
-F - Discard validator nodes that didn't bootup successfully
-o noLedgerVerify - Skip ledger verification
-o noValidatorSanity - Skip fullnode sanity
-o rejectExtraNodes - Require the exact number of nodes
@@ -80,12 +81,13 @@ numBenchTpsClients=0
numBenchExchangeClients=0
benchTpsExtraArgs=
benchExchangeExtraArgs=
failOnValidatorBootupFailure=true
command=$1
[[ -n $command ]] || usage
shift
while getopts "h?T:t:o:f:rD:i:c:" opt; do
while getopts "h?T:t:o:f:rD:i:c:F" opt; do
case $opt in
h | \?)
usage
@@ -167,6 +169,9 @@ while getopts "h?T:t:o:f:rD:i:c:" opt; do
}
getClientTypeAndNum
;;
F)
failOnValidatorBootupFailure=false
;;
*)
usage "Error: unhandled option: $opt"
;;
@@ -291,6 +296,7 @@ startBootstrapLeader() {
\"$RUST_LOG\" \
$skipSetup \
$leaderRotation \
$failOnValidatorBootupFailure \
"
) >> "$logFile" 2>&1 || {
cat "$logFile"
@@ -319,6 +325,7 @@ startNode() {
\"$RUST_LOG\" \
$skipSetup \
$leaderRotation \
$failOnValidatorBootupFailure \
"
) >> "$logFile" 2>&1 &
declare pid=$!
@@ -475,9 +482,13 @@ start() {
declare ok=true
wait "$pid" || ok=false
if ! $ok; then
echo "+++ fullnode failed to start"
cat "$netLogDir/fullnode-$pid.log"
echo ^^^ +++
exit 1
# A fullnode failed to start. Abort unless the operator passed -F, which sets
# failOnValidatorBootupFailure=false so the failure is only reported.
if $failOnValidatorBootupFailure; then
exit 1
else
echo "Failure is non-fatal"
fi
fi
done

View File

@@ -12,6 +12,7 @@ numNodes="$5"
RUST_LOG="$6"
skipSetup="$7"
leaderRotation="$8"
failOnValidatorBootupFailure="$9"
set +x
export RUST_LOG
@@ -35,12 +36,14 @@ missing() {
[[ -n $numNodes ]] || missing numNodes
[[ -n $skipSetup ]] || missing skipSetup
[[ -n $leaderRotation ]] || missing leaderRotation
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
cat > deployConfig <<EOF
deployMethod="$deployMethod"
entrypointIp="$entrypointIp"
numNodes="$numNodes"
leaderRotation=$leaderRotation
failOnValidatorBootupFailure=$failOnValidatorBootupFailure
EOF
source net/common.sh

View File

@@ -9,6 +9,7 @@ cd "$(dirname "$0")"/../..
deployMethod=
entrypointIp=
numNodes=
failOnValidatorBootupFailure=
[[ -r deployConfig ]] || {
echo deployConfig missing
@@ -26,6 +27,7 @@ missing() {
[[ -n $entrypointIp ]] || missing entrypointIp
[[ -n $numNodes ]] || missing numNodes
[[ -n $leaderRotation ]] || missing leaderRotation
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
ledgerVerify=true
validatorSanity=true
@@ -79,7 +81,17 @@ local|tar)
exit 1
esac
echo "+++ $entrypointIp: node count ($numNodes expected)"
# Decide how many nodes the sanity check must see in gossip.
#
# failOnValidatorBootupFailure=true (the default): validator bootup failures
#   are fatal, so every one of the $numNodes configured nodes is expected.
# failOnValidatorBootupFailure=false (net.sh -F): validators that failed to
#   boot are tolerated, so the check is relaxed to a single node. An exact
#   node count cannot be enforced in this mode, so rejectExtraNodes is refused.
if $failOnValidatorBootupFailure; then
  numSanityNodes="$numNodes"
else
  numSanityNodes=1
  if $rejectExtraNodes; then
    echo "rejectExtraNodes cannot be used with failOnValidatorBootupFailure"
    exit 1
  fi
fi
echo "+++ $entrypointIp: node count ($numSanityNodes expected)"
(
set -x
$solana_keygen -o "$client_id"
@@ -90,7 +102,7 @@ echo "+++ $entrypointIp: node count ($numNodes expected)"
fi
timeout 2m $solana_gossip --network "$entrypointIp:8001" \
spy --$nodeArg "$numNodes" \
spy --$nodeArg "$numSanityNodes" \
)
echo "--- RPC API: getTransactionCount"

View File

@@ -34,8 +34,10 @@ __cloud_FindInstances() {
instances+=("$name:$publicIp:$privateIp:$zone")
done < <(gcloud compute instances list \
--filter "$filter" \
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)')
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)' \
| grep RUNNING)
}
#
# cloud_FindInstances [namePrefix]
#