v0.14: various net/ fixes for large clusters (#4080)
* net.sh: Add -F to discard validator nodes that didn't boot up successfully
* Relax sanity node count when validator bootup failure is permitted
* Less sanity for testnet-demo
* net.sh: Add -F to discard validator nodes that didn't boot up successfully
This commit is contained in:
@ -282,39 +282,34 @@ if ! $skipStart; then
|
|||||||
op=start
|
op=start
|
||||||
fi
|
fi
|
||||||
echo "--- net.sh $op"
|
echo "--- net.sh $op"
|
||||||
|
args=("$op" -t "$tarChannelOrTag")
|
||||||
|
|
||||||
maybeRejectExtraNodes=
|
|
||||||
if ! $publicNetwork; then
|
if ! $publicNetwork; then
|
||||||
maybeRejectExtraNodes="-o rejectExtraNodes"
|
args+=(-o rejectExtraNodes)
|
||||||
fi
|
fi
|
||||||
maybeNoValidatorSanity=
|
|
||||||
if [[ -n $NO_VALIDATOR_SANITY ]]; then
|
if [[ -n $NO_VALIDATOR_SANITY ]]; then
|
||||||
maybeNoValidatorSanity="-o noValidatorSanity"
|
args+=(-o noValidatorSanity)
|
||||||
fi
|
fi
|
||||||
maybeNoLedgerVerify=
|
|
||||||
if [[ -n $NO_LEDGER_VERIFY ]]; then
|
if [[ -n $NO_LEDGER_VERIFY ]]; then
|
||||||
maybeNoLedgerVerify="-o noLedgerVerify"
|
args+=(-o noLedgerVerify)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
maybeReuseLedger=
|
|
||||||
if $reuseLedger; then
|
if $reuseLedger; then
|
||||||
maybeReuseLedger="-r"
|
args+=(-r)
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! $failOnValidatorBootupFailure; then
|
||||||
|
args+=(-F)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
maybeUpdateManifestKeypairFile=
|
|
||||||
# shellcheck disable=SC2154 # SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu comes from .buildkite/env/
|
# shellcheck disable=SC2154 # SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu comes from .buildkite/env/
|
||||||
if [[ -n $SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu ]]; then
|
if [[ -n $SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu ]]; then
|
||||||
echo "$SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu" > update_manifest_keypair.json
|
echo "$SOLANA_INSTALL_UPDATE_MANIFEST_KEYPAIR_x86_64_unknown_linux_gnu" > update_manifest_keypair.json
|
||||||
maybeUpdateManifestKeypairFile="-i update_manifest_keypair.json"
|
args+=(-i update_manifest_keypair.json)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# shellcheck disable=SC2086 # Don't want to double quote the $maybeXYZ variables
|
# shellcheck disable=SC2086 # Don't want to double quote the $maybeXYZ variables
|
||||||
time net/net.sh $op -t "$tarChannelOrTag" \
|
time net/net.sh "${args[@]}"
|
||||||
$maybeUpdateManifestKeypairFile \
|
|
||||||
$maybeReuseLedger \
|
|
||||||
$maybeRejectExtraNodes \
|
|
||||||
$maybeNoValidatorSanity \
|
|
||||||
$maybeNoLedgerVerify
|
|
||||||
) || ok=false
|
) || ok=false
|
||||||
|
|
||||||
net/net.sh logs
|
net/net.sh logs
|
||||||
|
@ -267,6 +267,7 @@ sanity() {
|
|||||||
ok=true
|
ok=true
|
||||||
if [[ -n $GCE_NODE_COUNT ]]; then
|
if [[ -n $GCE_NODE_COUNT ]]; then
|
||||||
NO_LEDGER_VERIFY=1 \
|
NO_LEDGER_VERIFY=1 \
|
||||||
|
NO_VALIDATOR_SANITY=1 \
|
||||||
ci/testnet-sanity.sh demo-testnet-solana-com gce "${GCE_ZONES[0]}" -f || ok=false
|
ci/testnet-sanity.sh demo-testnet-solana-com gce "${GCE_ZONES[0]}" -f || ok=false
|
||||||
else
|
else
|
||||||
echo "Error: no GCE nodes"
|
echo "Error: no GCE nodes"
|
||||||
@ -413,22 +414,26 @@ deploy() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# shellcheck disable=SC2068
|
# shellcheck disable=SC2068
|
||||||
ci/testnet-deploy.sh -p demo-testnet-solana-com -C gce ${GCE_ZONE_ARGS[@]} \
|
NO_LEDGER_VERIFY=1 \
|
||||||
-t "$CHANNEL_OR_TAG" -n "$GCE_NODE_COUNT" -c 0 -P -u -f \
|
NO_VALIDATOR_SANITY=1 \
|
||||||
-a demo-testnet-solana-com \
|
ci/testnet-deploy.sh -p demo-testnet-solana-com -C gce ${GCE_ZONE_ARGS[@]} \
|
||||||
${skipCreate:+-e} \
|
-t "$CHANNEL_OR_TAG" -n "$GCE_NODE_COUNT" -c 0 -P -u -f \
|
||||||
${maybeSkipStart:+-s} \
|
-a demo-testnet-solana-com \
|
||||||
${maybeStop:+-S} \
|
${skipCreate:+-e} \
|
||||||
${maybeDelete:+-D}
|
${maybeSkipStart:+-s} \
|
||||||
|
${maybeStop:+-S} \
|
||||||
|
${maybeDelete:+-D}
|
||||||
|
|
||||||
if [[ -n $GCE_LOW_QUOTA_NODE_COUNT ]]; then
|
if [[ -n $GCE_LOW_QUOTA_NODE_COUNT ]]; then
|
||||||
# shellcheck disable=SC2068
|
# shellcheck disable=SC2068
|
||||||
ci/testnet-deploy.sh -p demo-testnet-solana-com2 -C gce ${GCE_LOW_QUOTA_ZONE_ARGS[@]} \
|
NO_LEDGER_VERIFY=1 \
|
||||||
-t "$CHANNEL_OR_TAG" -n "$GCE_LOW_QUOTA_NODE_COUNT" -c 0 -P -f -x \
|
NO_VALIDATOR_SANITY=1 \
|
||||||
${skipCreate:+-e} \
|
ci/testnet-deploy.sh -p demo-testnet-solana-com2 -C gce ${GCE_LOW_QUOTA_ZONE_ARGS[@]} \
|
||||||
${skipStart:+-s} \
|
-t "$CHANNEL_OR_TAG" -n "$GCE_LOW_QUOTA_NODE_COUNT" -c 0 -P -f -x \
|
||||||
${maybeStop:+-S} \
|
${skipCreate:+-e} \
|
||||||
${maybeDelete:+-D}
|
${skipStart:+-s} \
|
||||||
|
${maybeStop:+-S} \
|
||||||
|
${maybeDelete:+-D}
|
||||||
fi
|
fi
|
||||||
)
|
)
|
||||||
;;
|
;;
|
||||||
|
14
net/gce.sh
14
net/gce.sh
@ -439,13 +439,15 @@ EOF
|
|||||||
for zone in "${zones[@]}"; do
|
for zone in "${zones[@]}"; do
|
||||||
echo "Looking for additional fullnode instances in $zone ..."
|
echo "Looking for additional fullnode instances in $zone ..."
|
||||||
cloud_FindInstances "$prefix-$zone-fullnode"
|
cloud_FindInstances "$prefix-$zone-fullnode"
|
||||||
[[ ${#instances[@]} -gt 0 ]] || {
|
if [[ ${#instances[@]} -gt 0 ]]; then
|
||||||
|
fetchPrivateKey
|
||||||
|
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
|
||||||
|
else
|
||||||
echo "Unable to find additional fullnodes"
|
echo "Unable to find additional fullnodes"
|
||||||
exit 1
|
if $failOnValidatorBootupFailure; then
|
||||||
}
|
exit 1
|
||||||
|
fi
|
||||||
fetchPrivateKey
|
fi
|
||||||
cloud_ForEachInstance recordInstanceIp "$failOnValidatorBootupFailure" fullnodeIpList
|
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
17
net/net.sh
17
net/net.sh
@ -51,6 +51,7 @@ Operate a configured testnet
|
|||||||
to the bench-tps client.
|
to the bench-tps client.
|
||||||
|
|
||||||
sanity/start/update-specific options:
|
sanity/start/update-specific options:
|
||||||
|
-F - Discard validator nodes that didn't bootup successfully
|
||||||
-o noLedgerVerify - Skip ledger verification
|
-o noLedgerVerify - Skip ledger verification
|
||||||
-o noValidatorSanity - Skip fullnode sanity
|
-o noValidatorSanity - Skip fullnode sanity
|
||||||
-o rejectExtraNodes - Require the exact number of nodes
|
-o rejectExtraNodes - Require the exact number of nodes
|
||||||
@ -80,12 +81,13 @@ numBenchTpsClients=0
|
|||||||
numBenchExchangeClients=0
|
numBenchExchangeClients=0
|
||||||
benchTpsExtraArgs=
|
benchTpsExtraArgs=
|
||||||
benchExchangeExtraArgs=
|
benchExchangeExtraArgs=
|
||||||
|
failOnValidatorBootupFailure=true
|
||||||
|
|
||||||
command=$1
|
command=$1
|
||||||
[[ -n $command ]] || usage
|
[[ -n $command ]] || usage
|
||||||
shift
|
shift
|
||||||
|
|
||||||
while getopts "h?T:t:o:f:rD:i:c:" opt; do
|
while getopts "h?T:t:o:f:rD:i:c:F" opt; do
|
||||||
case $opt in
|
case $opt in
|
||||||
h | \?)
|
h | \?)
|
||||||
usage
|
usage
|
||||||
@ -167,6 +169,9 @@ while getopts "h?T:t:o:f:rD:i:c:" opt; do
|
|||||||
}
|
}
|
||||||
getClientTypeAndNum
|
getClientTypeAndNum
|
||||||
;;
|
;;
|
||||||
|
F)
|
||||||
|
failOnValidatorBootupFailure=false
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
usage "Error: unhandled option: $opt"
|
usage "Error: unhandled option: $opt"
|
||||||
;;
|
;;
|
||||||
@ -291,6 +296,7 @@ startBootstrapLeader() {
|
|||||||
\"$RUST_LOG\" \
|
\"$RUST_LOG\" \
|
||||||
$skipSetup \
|
$skipSetup \
|
||||||
$leaderRotation \
|
$leaderRotation \
|
||||||
|
$failOnValidatorBootupFailure \
|
||||||
"
|
"
|
||||||
) >> "$logFile" 2>&1 || {
|
) >> "$logFile" 2>&1 || {
|
||||||
cat "$logFile"
|
cat "$logFile"
|
||||||
@ -319,6 +325,7 @@ startNode() {
|
|||||||
\"$RUST_LOG\" \
|
\"$RUST_LOG\" \
|
||||||
$skipSetup \
|
$skipSetup \
|
||||||
$leaderRotation \
|
$leaderRotation \
|
||||||
|
$failOnValidatorBootupFailure \
|
||||||
"
|
"
|
||||||
) >> "$logFile" 2>&1 &
|
) >> "$logFile" 2>&1 &
|
||||||
declare pid=$!
|
declare pid=$!
|
||||||
@ -475,9 +482,13 @@ start() {
|
|||||||
declare ok=true
|
declare ok=true
|
||||||
wait "$pid" || ok=false
|
wait "$pid" || ok=false
|
||||||
if ! $ok; then
|
if ! $ok; then
|
||||||
|
echo "+++ fullnode failed to start"
|
||||||
cat "$netLogDir/fullnode-$pid.log"
|
cat "$netLogDir/fullnode-$pid.log"
|
||||||
echo ^^^ +++
|
if $failOnValidatorBootupFailure; then
|
||||||
exit 1
|
exit 1
|
||||||
|
else
|
||||||
|
echo "Failure is non-fatal"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
@ -12,6 +12,7 @@ numNodes="$5"
|
|||||||
RUST_LOG="$6"
|
RUST_LOG="$6"
|
||||||
skipSetup="$7"
|
skipSetup="$7"
|
||||||
leaderRotation="$8"
|
leaderRotation="$8"
|
||||||
|
failOnValidatorBootupFailure="$9"
|
||||||
set +x
|
set +x
|
||||||
export RUST_LOG
|
export RUST_LOG
|
||||||
|
|
||||||
@ -35,12 +36,14 @@ missing() {
|
|||||||
[[ -n $numNodes ]] || missing numNodes
|
[[ -n $numNodes ]] || missing numNodes
|
||||||
[[ -n $skipSetup ]] || missing skipSetup
|
[[ -n $skipSetup ]] || missing skipSetup
|
||||||
[[ -n $leaderRotation ]] || missing leaderRotation
|
[[ -n $leaderRotation ]] || missing leaderRotation
|
||||||
|
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
|
||||||
|
|
||||||
cat > deployConfig <<EOF
|
cat > deployConfig <<EOF
|
||||||
deployMethod="$deployMethod"
|
deployMethod="$deployMethod"
|
||||||
entrypointIp="$entrypointIp"
|
entrypointIp="$entrypointIp"
|
||||||
numNodes="$numNodes"
|
numNodes="$numNodes"
|
||||||
leaderRotation=$leaderRotation
|
leaderRotation=$leaderRotation
|
||||||
|
failOnValidatorBootupFailure=$failOnValidatorBootupFailure
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
source net/common.sh
|
source net/common.sh
|
||||||
|
@ -9,6 +9,7 @@ cd "$(dirname "$0")"/../..
|
|||||||
deployMethod=
|
deployMethod=
|
||||||
entrypointIp=
|
entrypointIp=
|
||||||
numNodes=
|
numNodes=
|
||||||
|
failOnValidatorBootupFailure=
|
||||||
|
|
||||||
[[ -r deployConfig ]] || {
|
[[ -r deployConfig ]] || {
|
||||||
echo deployConfig missing
|
echo deployConfig missing
|
||||||
@ -26,6 +27,7 @@ missing() {
|
|||||||
[[ -n $entrypointIp ]] || missing entrypointIp
|
[[ -n $entrypointIp ]] || missing entrypointIp
|
||||||
[[ -n $numNodes ]] || missing numNodes
|
[[ -n $numNodes ]] || missing numNodes
|
||||||
[[ -n $leaderRotation ]] || missing leaderRotation
|
[[ -n $leaderRotation ]] || missing leaderRotation
|
||||||
|
[[ -n $failOnValidatorBootupFailure ]] || missing failOnValidatorBootupFailure
|
||||||
|
|
||||||
ledgerVerify=true
|
ledgerVerify=true
|
||||||
validatorSanity=true
|
validatorSanity=true
|
||||||
@ -79,7 +81,17 @@ local|tar)
|
|||||||
exit 1
|
exit 1
|
||||||
esac
|
esac
|
||||||
|
|
||||||
echo "+++ $entrypointIp: node count ($numNodes expected)"
|
if $failOnValidatorBootupFailure; then
|
||||||
|
numSanityNodes=1
|
||||||
|
if $rejectExtraNodes; then
|
||||||
|
echo "rejectExtraNodes cannot be used with failOnValidatorBootupFailure"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
numSanityNodes="$numNodes"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "+++ $entrypointIp: node count ($numSanityNodes expected)"
|
||||||
(
|
(
|
||||||
set -x
|
set -x
|
||||||
$solana_keygen -o "$client_id"
|
$solana_keygen -o "$client_id"
|
||||||
@ -90,7 +102,7 @@ echo "+++ $entrypointIp: node count ($numNodes expected)"
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
timeout 2m $solana_gossip --network "$entrypointIp:8001" \
|
timeout 2m $solana_gossip --network "$entrypointIp:8001" \
|
||||||
spy --$nodeArg "$numNodes" \
|
spy --$nodeArg "$numSanityNodes" \
|
||||||
)
|
)
|
||||||
|
|
||||||
echo "--- RPC API: getTransactionCount"
|
echo "--- RPC API: getTransactionCount"
|
||||||
|
@ -34,8 +34,10 @@ __cloud_FindInstances() {
|
|||||||
instances+=("$name:$publicIp:$privateIp:$zone")
|
instances+=("$name:$publicIp:$privateIp:$zone")
|
||||||
done < <(gcloud compute instances list \
|
done < <(gcloud compute instances list \
|
||||||
--filter "$filter" \
|
--filter "$filter" \
|
||||||
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)')
|
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)' \
|
||||||
|
| grep RUNNING)
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
# cloud_FindInstances [namePrefix]
|
# cloud_FindInstances [namePrefix]
|
||||||
#
|
#
|
||||||
|
Reference in New Issue
Block a user