From d21fa4a1776fe131a3f805c33546d1253c6ebbf2 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Mon, 29 Apr 2019 21:38:03 -0700 Subject: [PATCH] v0.14: various net/ fixes for large clusters (#4080) * net.sh: Add -F to discard validator nodes that didn't bootup successfully * Relax sanity node count when validator bootup failure is permitted * Less sanity for testnet-demo * net.sh: Add -F to discard validator nodes that didn't bootup successfully --- ci/testnet-deploy.sh | 27 +++++++++++---------------- ci/testnet-manager.sh | 31 ++++++++++++++++++------------- net/gce.sh | 14 ++++++++------ net/net.sh | 17 ++++++++++++++--- net/remote/remote-node.sh | 3 +++ net/remote/remote-sanity.sh | 16 ++++++++++++++-- net/scripts/gce-provider.sh | 4 +++- 7 files changed, 71 insertions(+), 41 deletions(-) diff --git a/ci/testnet-deploy.sh b/ci/testnet-deploy.sh index c788548075..76c5225591 100755 --- a/ci/testnet-deploy.sh +++ b/ci/testnet-deploy.sh @@ -282,39 +282,34 @@ if ! $skipStart; then op=start fi echo "--- net.sh $op" + args=("$op" -t "$tarChannelOrTag") - maybeRejectExtraNodes= if ! $publicNetwork; then - maybeRejectExtraNodes="-o rejectExtraNodes" + args+=(-o rejectExtraNodes) fi - maybeNoValidatorSanity= if [[ -n $NO_VALIDATOR_SANITY ]]; then - maybeNoValidatorSanity="-o noValidatorSanity" + args+=(-o noValidatorSanity) fi - maybeNoLedgerVerify= if [[ -n $NO_LEDGER_VERIFY ]]; then - maybeNoLedgerVerify="-o noLedgerVerify" + args+=(-o noLedgerVerify) fi - maybeReuseLedger= if $reuseLedger; then - maybeReuseLedger="-r" + args+=(-r) + fi + + if ! $failOnValidatorBootupFailure; then + args+=(-F) fi - maybeUpdateManifestKeypairFile= # shellcheck disable=SC2154 # SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu comes from .buildkite/env/ if [[ -n $SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu ]]; then echo "$SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu" > update_manifest_keypair.json - maybeUpdateManifestKeypairFile="-i update_manifest_keypair.json" + args+=(-i update_manifest_keypair.json) fi # shellcheck disable=SC2086 # Don't want to double quote the $maybeXYZ variables - time net/net.sh $op -t "$tarChannelOrTag" \ - $maybeUpdateManifestKeypairFile \ - $maybeReuseLedger \ - $maybeRejectExtraNodes \ - $maybeNoValidatorSanity \ - $maybeNoLedgerVerify + time net/net.sh "${args[@]}" ) || ok=false net/net.sh logs diff --git a/ci/testnet-manager.sh b/ci/testnet-manager.sh index 10558d3ded..379bfa3cf7 100755 --- a/ci/testnet-manager.sh +++ b/ci/testnet-manager.sh @@ -267,6 +267,7 @@ sanity() { ok=true if [[ -n $GCE_NODE_COUNT ]]; then NO_LEDGER_VERIFY=1 \ + NO_VALIDATOR_SANITY=1 \ ci/testnet-sanity.sh demo-testnet-solana-com gce "${GCE_ZONES[0]}" -f || ok=false else echo "Error: no GCE nodes" @@ -413,22 +414,26 @@ deploy() { fi # shellcheck disable=SC2068 - ci/testnet-deploy.sh -p demo-testnet-solana-com -C gce ${GCE_ZONE_ARGS[@]} \ - -t "$CHANNEL_OR_TAG" -n "$GCE_NODE_COUNT" -c 0 -P -u -f \ - -a demo-testnet-solana-com \ - ${skipCreate:+-e} \ - ${maybeSkipStart:+-s} \ - ${maybeStop:+-S} \ - ${maybeDelete:+-D} + NO_LEDGER_VERIFY=1 \ + NO_VALIDATOR_SANITY=1 \ + ci/testnet-deploy.sh -p demo-testnet-solana-com -C gce ${GCE_ZONE_ARGS[@]} \ + -t "$CHANNEL_OR_TAG" -n "$GCE_NODE_COUNT" -c 0 -P -u -f \ + -a demo-testnet-solana-com \ + ${skipCreate:+-e} \ + ${maybeSkipStart:+-s} \ + ${maybeStop:+-S} \ + ${maybeDelete:+-D} if [[ -n $GCE_LOW_QUOTA_NODE_COUNT ]]; then # shellcheck disable=SC2068 - ci/testnet-deploy.sh -p demo-testnet-solana-com2 -C gce ${GCE_LOW_QUOTA_ZONE_ARGS[@]} \ - -t "$CHANNEL_OR_TAG" -n "$GCE_LOW_QUOTA_NODE_COUNT" -c 0 -P -f -x \ - ${skipCreate:+-e} \ - ${skipStart:+-s} \ - ${maybeStop:+-S} \ - ${maybeDelete:+-D} + NO_LEDGER_VERIFY=1 \ + NO_VALIDATOR_SANITY=1 \ + ci/testnet-deploy.sh -p demo-testnet-solana-com2 -C gce ${GCE_LOW_QUOTA_ZONE_ARGS[@]} \ + -t "$CHANNEL_OR_TAG" -n "$GCE_LOW_QUOTA_NODE_COUNT" -c 0 -P -f -x \ + ${skipCreate:+-e} \ + ${skipStart:+-s} \ + ${maybeStop:+-S} \ + ${maybeDelete:+-D} fi ) ;; diff --git a/net/gce.sh b/net/gce.sh index aeae569560..6158eddafb 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -439,13 +439,15 @@ EOF for zone in "${zones[@]}"; do echo "Looking for additional fullnode instances in $zone ..." cloud_FindInstances "$prefix-$zone-fullnode" - [[ ${#instances[@]} -gt 0 ]] || { + if [[ ${#instances[@]} -gt 0 ]]; then + fetchPrivateKey + cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList + else echo "Unable to find additional fullnodes" - exit 1 - } - - fetchPrivateKey - cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList + if $failOnValidatorBootupFailure; then + exit 1 + fi + fi done fi diff --git a/net/net.sh b/net/net.sh index 7c62c6a086..5b9c2e8394 100755 --- a/net/net.sh +++ b/net/net.sh @@ -51,6 +51,7 @@ Operate a configured testnet to the bench-tps client. sanity/start/update-specific options: + -F - Discard validator nodes that didn't bootup successfully -o noLedgerVerify - Skip ledger verification -o noValidatorSanity - Skip fullnode sanity -o rejectExtraNodes - Require the exact number of nodes @@ -80,12 +81,13 @@ numBenchTpsClients=0 numBenchExchangeClients=0 benchTpsExtraArgs= benchExchangeExtraArgs= +failOnValidatorBootupFailure=true command=$1 [[ -n $command ]] || usage shift -while getopts "h?T:t:o:f:rD:i:c:" opt; do +while getopts "h?T:t:o:f:rD:i:c:F" opt; do case $opt in h | \?) usage @@ -167,6 +169,9 @@ while getopts "h?T:t:o:f:rD:i:c:" opt; do } getClientTypeAndNum ;; + F) + failOnValidatorBootupFailure=false + ;; *) usage "Error: unhandled option: $opt" ;; @@ -291,6 +296,7 @@ startBootstrapLeader() { \"$RUST_LOG\" \ $skipSetup \ $leaderRotation \ + $failOnValidatorBootupFailure \ " ) >> "$logFile" 2>&1 || { cat "$logFile" @@ -319,6 +325,7 @@ startNode() { \"$RUST_LOG\" \ $skipSetup \ $leaderRotation \ + $failOnValidatorBootupFailure \ " ) >> "$logFile" 2>&1 & declare pid=$! @@ -475,9 +482,13 @@ start() { declare ok=true wait "$pid" || ok=false if ! $ok; then + echo "+++ fullnode failed to start" cat "$netLogDir/fullnode-$pid.log" - echo ^^^ +++ - exit 1 + if $failOnValidatorBootupFailure; then + exit 1 + else + echo "Failure is non-fatal" + fi fi done diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index 81d6426ad5..2732ad1283 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -12,6 +12,7 @@ numNodes="$5" RUST_LOG="$6" skipSetup="$7" leaderRotation="$8" +failOnValidatorBootupFailure="$9" set +x export RUST_LOG @@ -35,12 +36,14 @@ missing() { [[ -n $numNodes ]] || missing numNodes [[ -n $skipSetup ]] || missing skipSetup [[ -n $leaderRotation ]] || missing leaderRotation +[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure cat > deployConfig <