v0.14: various net/ fixes for large clusters (#4080)

* net.sh: Add -F to discard validator nodes that didn't boot up successfully

* Relax sanity node count when validator bootup failure is permitted

* Run fewer sanity checks for testnet-demo (set NO_LEDGER_VERIFY and NO_VALIDATOR_SANITY)

net.sh: Add -F to discard validator nodes that didn't boot up successfully
This commit is contained in:
Michael Vines
2019-04-29 21:38:03 -07:00
parent bd0871cbe7
commit d21fa4a177
7 changed files with 71 additions and 41 deletions

View File

@ -282,39 +282,34 @@ if ! $skipStart; then
op=start op=start
fi fi
echo "--- net.sh $op" echo "--- net.sh $op"
args=("$op" -t "$tarChannelOrTag")
maybeRejectExtraNodes=
if ! $publicNetwork; then if ! $publicNetwork; then
maybeRejectExtraNodes="-o rejectExtraNodes" args+=(-o rejectExtraNodes)
fi fi
maybeNoValidatorSanity=
if [[ -n $NO_VALIDATOR_SANITY ]]; then if [[ -n $NO_VALIDATOR_SANITY ]]; then
maybeNoValidatorSanity="-o noValidatorSanity" args+=(-o noValidatorSanity)
fi fi
maybeNoLedgerVerify=
if [[ -n $NO_LEDGER_VERIFY ]]; then if [[ -n $NO_LEDGER_VERIFY ]]; then
maybeNoLedgerVerify="-o noLedgerVerify" args+=(-o noLedgerVerify)
fi fi
maybeReuseLedger=
if $reuseLedger; then if $reuseLedger; then
maybeReuseLedger="-r" args+=(-r)
fi
if ! $failOnValidatorBootupFailure; then
args+=(-F)
fi fi
maybeUpdateManifestKeypairFile=
# shellcheck disable=SC2154 # SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu comes from .buildkite/env/ # shellcheck disable=SC2154 # SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu comes from .buildkite/env/
if [[ -n $SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu ]]; then if [[ -n $SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu ]]; then
echo "$SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu" > update_manifest_keypair.json echo "$SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu" > update_manifest_keypair.json
maybeUpdateManifestKeypairFile="-i update_manifest_keypair.json" args+=(-i update_manifest_keypair.json)
fi fi
# shellcheck disable=SC2086 # Don't want to double quote the $maybeXYZ variables # shellcheck disable=SC2086 # Don't want to double quote the $maybeXYZ variables
time net/net.sh $op -t "$tarChannelOrTag" \ time net/net.sh "${args[@]}"
$maybeUpdateManifestKeypairFile \
$maybeReuseLedger \
$maybeRejectExtraNodes \
$maybeNoValidatorSanity \
$maybeNoLedgerVerify
) || ok=false ) || ok=false
net/net.sh logs net/net.sh logs

View File

@ -267,6 +267,7 @@ sanity() {
ok=true ok=true
if [[ -n $GCE_NODE_COUNT ]]; then if [[ -n $GCE_NODE_COUNT ]]; then
NO_LEDGER_VERIFY=1 \ NO_LEDGER_VERIFY=1 \
NO_VALIDATOR_SANITY=1 \
ci/testnet-sanity.sh demo-testnet-solana-com gce "${GCE_ZONES[0]}" -f || ok=false ci/testnet-sanity.sh demo-testnet-solana-com gce "${GCE_ZONES[0]}" -f || ok=false
else else
echo "Error: no GCE nodes" echo "Error: no GCE nodes"
@ -413,22 +414,26 @@ deploy() {
fi fi
# shellcheck disable=SC2068 # shellcheck disable=SC2068
ci/testnet-deploy.sh -p demo-testnet-solana-com -C gce ${GCE_ZONE_ARGS[@]} \ NO_LEDGER_VERIFY=1 \
-t "$CHANNEL_OR_TAG" -n "$GCE_NODE_COUNT" -c 0 -P -u -f \ NO_VALIDATOR_SANITY=1 \
-a demo-testnet-solana-com \ ci/testnet-deploy.sh -p demo-testnet-solana-com -C gce ${GCE_ZONE_ARGS[@]} \
${skipCreate:+-e} \ -t "$CHANNEL_OR_TAG" -n "$GCE_NODE_COUNT" -c 0 -P -u -f \
${maybeSkipStart:+-s} \ -a demo-testnet-solana-com \
${maybeStop:+-S} \ ${skipCreate:+-e} \
${maybeDelete:+-D} ${maybeSkipStart:+-s} \
${maybeStop:+-S} \
${maybeDelete:+-D}
if [[ -n $GCE_LOW_QUOTA_NODE_COUNT ]]; then if [[ -n $GCE_LOW_QUOTA_NODE_COUNT ]]; then
# shellcheck disable=SC2068 # shellcheck disable=SC2068
ci/testnet-deploy.sh -p demo-testnet-solana-com2 -C gce ${GCE_LOW_QUOTA_ZONE_ARGS[@]} \ NO_LEDGER_VERIFY=1 \
-t "$CHANNEL_OR_TAG" -n "$GCE_LOW_QUOTA_NODE_COUNT" -c 0 -P -f -x \ NO_VALIDATOR_SANITY=1 \
${skipCreate:+-e} \ ci/testnet-deploy.sh -p demo-testnet-solana-com2 -C gce ${GCE_LOW_QUOTA_ZONE_ARGS[@]} \
${skipStart:+-s} \ -t "$CHANNEL_OR_TAG" -n "$GCE_LOW_QUOTA_NODE_COUNT" -c 0 -P -f -x \
${maybeStop:+-S} \ ${skipCreate:+-e} \
${maybeDelete:+-D} ${skipStart:+-s} \
${maybeStop:+-S} \
${maybeDelete:+-D}
fi fi
) )
;; ;;

View File

@ -439,13 +439,15 @@ EOF
for zone in "${zones[@]}"; do for zone in "${zones[@]}"; do
echo "Looking for additional fullnode instances in $zone ..." echo "Looking for additional fullnode instances in $zone ..."
cloud_FindInstances "$prefix-$zone-fullnode" cloud_FindInstances "$prefix-$zone-fullnode"
[[ ${#instances[@]} -gt 0 ]] || { if [[ ${#instances[@]} -gt 0 ]]; then
fetchPrivateKey
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
else
echo "Unable to find additional fullnodes" echo "Unable to find additional fullnodes"
exit 1 if $failOnValidatorBootupFailure; then
} exit 1
fi
fetchPrivateKey fi
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
done done
fi fi

View File

@ -51,6 +51,7 @@ Operate a configured testnet
to the bench-tps client. to the bench-tps client.
sanity/start/update-specific options: sanity/start/update-specific options:
-F - Discard validator nodes that didn't bootup successfully
-o noLedgerVerify - Skip ledger verification -o noLedgerVerify - Skip ledger verification
-o noValidatorSanity - Skip fullnode sanity -o noValidatorSanity - Skip fullnode sanity
-o rejectExtraNodes - Require the exact number of nodes -o rejectExtraNodes - Require the exact number of nodes
@ -80,12 +81,13 @@ numBenchTpsClients=0
numBenchExchangeClients=0 numBenchExchangeClients=0
benchTpsExtraArgs= benchTpsExtraArgs=
benchExchangeExtraArgs= benchExchangeExtraArgs=
failOnValidatorBootupFailure=true
command=$1 command=$1
[[ -n $command ]] || usage [[ -n $command ]] || usage
shift shift
while getopts "h?T:t:o:f:rD:i:c:" opt; do while getopts "h?T:t:o:f:rD:i:c:F" opt; do
case $opt in case $opt in
h | \?) h | \?)
usage usage
@ -167,6 +169,9 @@ while getopts "h?T:t:o:f:rD:i:c:" opt; do
} }
getClientTypeAndNum getClientTypeAndNum
;; ;;
F)
failOnValidatorBootupFailure=false
;;
*) *)
usage "Error: unhandled option: $opt" usage "Error: unhandled option: $opt"
;; ;;
@ -291,6 +296,7 @@ startBootstrapLeader() {
\"$RUST_LOG\" \ \"$RUST_LOG\" \
$skipSetup \ $skipSetup \
$leaderRotation \ $leaderRotation \
$failOnValidatorBootupFailure \
" "
) >> "$logFile" 2>&1 || { ) >> "$logFile" 2>&1 || {
cat "$logFile" cat "$logFile"
@ -319,6 +325,7 @@ startNode() {
\"$RUST_LOG\" \ \"$RUST_LOG\" \
$skipSetup \ $skipSetup \
$leaderRotation \ $leaderRotation \
$failOnValidatorBootupFailure \
" "
) >> "$logFile" 2>&1 & ) >> "$logFile" 2>&1 &
declare pid=$! declare pid=$!
@ -475,9 +482,13 @@ start() {
declare ok=true declare ok=true
wait "$pid" || ok=false wait "$pid" || ok=false
if ! $ok; then if ! $ok; then
echo "+++ fullnode failed to start"
cat "$netLogDir/fullnode-$pid.log" cat "$netLogDir/fullnode-$pid.log"
echo ^^^ +++ if $failOnValidatorBootupFailure; then
exit 1 exit 1
else
echo "Failure is non-fatal"
fi
fi fi
done done

View File

@ -12,6 +12,7 @@ numNodes="$5"
RUST_LOG="$6" RUST_LOG="$6"
skipSetup="$7" skipSetup="$7"
leaderRotation="$8" leaderRotation="$8"
failOnValidatorBootupFailure="$9"
set +x set +x
export RUST_LOG export RUST_LOG
@ -35,12 +36,14 @@ missing() {
[[ -n $numNodes ]] || missing numNodes [[ -n $numNodes ]] || missing numNodes
[[ -n $skipSetup ]] || missing skipSetup [[ -n $skipSetup ]] || missing skipSetup
[[ -n $leaderRotation ]] || missing leaderRotation [[ -n $leaderRotation ]] || missing leaderRotation
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
cat > deployConfig <<EOF cat > deployConfig <<EOF
deployMethod="$deployMethod" deployMethod="$deployMethod"
entrypointIp="$entrypointIp" entrypointIp="$entrypointIp"
numNodes="$numNodes" numNodes="$numNodes"
leaderRotation=$leaderRotation leaderRotation=$leaderRotation
failOnValidatorBootupFailure=$failOnValidatorBootupFailure
EOF EOF
source net/common.sh source net/common.sh

View File

@ -9,6 +9,7 @@ cd "$(dirname "$0")"/../..
deployMethod= deployMethod=
entrypointIp= entrypointIp=
numNodes= numNodes=
failOnValidatorBootupFailure=
[[ -r deployConfig ]] || { [[ -r deployConfig ]] || {
echo deployConfig missing echo deployConfig missing
@ -26,6 +27,7 @@ missing() {
[[ -n $entrypointIp ]] || missing entrypointIp [[ -n $entrypointIp ]] || missing entrypointIp
[[ -n $numNodes ]] || missing numNodes [[ -n $numNodes ]] || missing numNodes
[[ -n $leaderRotation ]] || missing leaderRotation [[ -n $leaderRotation ]] || missing leaderRotation
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
ledgerVerify=true ledgerVerify=true
validatorSanity=true validatorSanity=true
@ -79,7 +81,17 @@ local|tar)
exit 1 exit 1
esac esac
# Decide how many nodes the gossip spy below must observe.
#
# Strict mode (failOnValidatorBootupFailure=true, the net.sh default): every
# configured node is expected, so the full $numNodes count is required.
#
# Lenient mode (net.sh -F, failOnValidatorBootupFailure=false): validators
# that failed to boot are discarded, so the final node count is unknown and
# only one node is required.  An exact-count requirement (rejectExtraNodes)
# cannot be honored in this mode, so the combination is rejected up front.
if $failOnValidatorBootupFailure; then
  numSanityNodes="$numNodes"
else
  numSanityNodes=1
  if $rejectExtraNodes; then
    echo "rejectExtraNodes cannot be used with failOnValidatorBootupFailure"
    exit 1
  fi
fi

echo "+++ $entrypointIp: node count ($numSanityNodes expected)"
( (
set -x set -x
$solana_keygen -o "$client_id" $solana_keygen -o "$client_id"
@ -90,7 +102,7 @@ echo "+++ $entrypointIp: node count ($numNodes expected)"
fi fi
timeout 2m $solana_gossip --network "$entrypointIp:8001" \ timeout 2m $solana_gossip --network "$entrypointIp:8001" \
spy --$nodeArg "$numNodes" \ spy --$nodeArg "$numSanityNodes" \
) )
echo "--- RPC API: getTransactionCount" echo "--- RPC API: getTransactionCount"

View File

@ -34,8 +34,10 @@ __cloud_FindInstances() {
instances+=("$name:$publicIp:$privateIp:$zone") instances+=("$name:$publicIp:$privateIp:$zone")
done < <(gcloud compute instances list \ done < <(gcloud compute instances list \
--filter "$filter" \ --filter "$filter" \
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)') --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)' \
| grep RUNNING)
} }
# #
# cloud_FindInstances [namePrefix] # cloud_FindInstances [namePrefix]
# #