v0.14: various net/ fixes for large clusters (#4080)
* net.sh: Add -F to discard validator nodes that didn't boot up successfully
* Relax sanity node count when validator bootup failure is permitted
* Less sanity for testnet-demo
* net.sh: Add -F to discard validator nodes that didn't boot up successfully
This commit is contained in:
14
net/gce.sh
14
net/gce.sh
@@ -439,13 +439,15 @@ EOF
|
||||
for zone in "${zones[@]}"; do
|
||||
echo "Looking for additional fullnode instances in $zone ..."
|
||||
cloud_FindInstances "$prefix-$zone-fullnode"
|
||||
[[ ${#instances[@]} -gt 0 ]] || {
|
||||
if [[ ${#instances[@]} -gt 0 ]]; then
|
||||
fetchPrivateKey
|
||||
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
|
||||
else
|
||||
echo "Unable to find additional fullnodes"
|
||||
exit 1
|
||||
}
|
||||
|
||||
fetchPrivateKey
|
||||
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
|
||||
if $failOnValidatorBootupFailure; then
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
|
17
net/net.sh
17
net/net.sh
@@ -51,6 +51,7 @@ Operate a configured testnet
|
||||
to the bench-tps client.
|
||||
|
||||
sanity/start/update-specific options:
|
||||
-F - Discard validator nodes that didn't bootup successfully
|
||||
-o noLedgerVerify - Skip ledger verification
|
||||
-o noValidatorSanity - Skip fullnode sanity
|
||||
-o rejectExtraNodes - Require the exact number of nodes
|
||||
@@ -80,12 +81,13 @@ numBenchTpsClients=0
|
||||
numBenchExchangeClients=0
|
||||
benchTpsExtraArgs=
|
||||
benchExchangeExtraArgs=
|
||||
failOnValidatorBootupFailure=true
|
||||
|
||||
command=$1
|
||||
[[ -n $command ]] || usage
|
||||
shift
|
||||
|
||||
while getopts "h?T:t:o:f:rD:i:c:" opt; do
|
||||
while getopts "h?T:t:o:f:rD:i:c:F" opt; do
|
||||
case $opt in
|
||||
h | \?)
|
||||
usage
|
||||
@@ -167,6 +169,9 @@ while getopts "h?T:t:o:f:rD:i:c:" opt; do
|
||||
}
|
||||
getClientTypeAndNum
|
||||
;;
|
||||
F)
|
||||
failOnValidatorBootupFailure=false
|
||||
;;
|
||||
*)
|
||||
usage "Error: unhandled option: $opt"
|
||||
;;
|
||||
@@ -291,6 +296,7 @@ startBootstrapLeader() {
|
||||
\"$RUST_LOG\" \
|
||||
$skipSetup \
|
||||
$leaderRotation \
|
||||
$failOnValidatorBootupFailure \
|
||||
"
|
||||
) >> "$logFile" 2>&1 || {
|
||||
cat "$logFile"
|
||||
@@ -319,6 +325,7 @@ startNode() {
|
||||
\"$RUST_LOG\" \
|
||||
$skipSetup \
|
||||
$leaderRotation \
|
||||
$failOnValidatorBootupFailure \
|
||||
"
|
||||
) >> "$logFile" 2>&1 &
|
||||
declare pid=$!
|
||||
@@ -475,9 +482,13 @@ start() {
|
||||
declare ok=true
|
||||
wait "$pid" || ok=false
|
||||
if ! $ok; then
|
||||
echo "+++ fullnode failed to start"
|
||||
cat "$netLogDir/fullnode-$pid.log"
|
||||
echo ^^^ +++
|
||||
exit 1
|
||||
if $failOnValidatorBootupFailure; then
|
||||
exit 1
|
||||
else
|
||||
echo "Failure is non-fatal"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
|
@@ -12,6 +12,7 @@ numNodes="$5"
|
||||
RUST_LOG="$6"
|
||||
skipSetup="$7"
|
||||
leaderRotation="$8"
|
||||
failOnValidatorBootupFailure="$9"
|
||||
set +x
|
||||
export RUST_LOG
|
||||
|
||||
@@ -35,12 +36,14 @@ missing() {
|
||||
[[ -n $numNodes ]] || missing numNodes
|
||||
[[ -n $skipSetup ]] || missing skipSetup
|
||||
[[ -n $leaderRotation ]] || missing leaderRotation
|
||||
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
|
||||
|
||||
cat > deployConfig <<EOF
|
||||
deployMethod="$deployMethod"
|
||||
entrypointIp="$entrypointIp"
|
||||
numNodes="$numNodes"
|
||||
leaderRotation=$leaderRotation
|
||||
failOnValidatorBootupFailure=$failOnValidatorBootupFailure
|
||||
EOF
|
||||
|
||||
source net/common.sh
|
||||
|
@@ -9,6 +9,7 @@ cd "$(dirname "$0")"/../..
|
||||
deployMethod=
|
||||
entrypointIp=
|
||||
numNodes=
|
||||
failOnValidatorBootupFailure=
|
||||
|
||||
[[ -r deployConfig ]] || {
|
||||
echo deployConfig missing
|
||||
@@ -26,6 +27,7 @@ missing() {
|
||||
[[ -n $entrypointIp ]] || missing entrypointIp
|
||||
[[ -n $numNodes ]] || missing numNodes
|
||||
[[ -n $leaderRotation ]] || missing leaderRotation
|
||||
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
|
||||
|
||||
ledgerVerify=true
|
||||
validatorSanity=true
|
||||
@@ -79,7 +81,17 @@ local|tar)
|
||||
exit 1
|
||||
esac
|
||||
|
||||
echo "+++ $entrypointIp: node count ($numNodes expected)"
|
||||
if $failOnValidatorBootupFailure; then
|
||||
numSanityNodes=1
|
||||
if $rejectExtraNodes; then
|
||||
echo "rejectExtraNodes cannot be used with failOnValidatorBootupFailure"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
numSanityNodes="$numNodes"
|
||||
fi
|
||||
|
||||
echo "+++ $entrypointIp: node count ($numSanityNodes expected)"
|
||||
(
|
||||
set -x
|
||||
$solana_keygen -o "$client_id"
|
||||
@@ -90,7 +102,7 @@ echo "+++ $entrypointIp: node count ($numNodes expected)"
|
||||
fi
|
||||
|
||||
timeout 2m $solana_gossip --network "$entrypointIp:8001" \
|
||||
spy --$nodeArg "$numNodes" \
|
||||
spy --$nodeArg "$numSanityNodes" \
|
||||
)
|
||||
|
||||
echo "--- RPC API: getTransactionCount"
|
||||
|
@@ -34,8 +34,10 @@ __cloud_FindInstances() {
|
||||
instances+=("$name:$publicIp:$privateIp:$zone")
|
||||
done < <(gcloud compute instances list \
|
||||
--filter "$filter" \
|
||||
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)')
|
||||
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)' \
|
||||
| grep RUNNING)
|
||||
}
|
||||
|
||||
#
|
||||
# cloud_FindInstances [namePrefix]
|
||||
#
|
||||
|
Reference in New Issue
Block a user