diff --git a/net/net.sh b/net/net.sh index 97860c9c6e..8c258e0600 100755 --- a/net/net.sh +++ b/net/net.sh @@ -13,7 +13,7 @@ usage() { echo "Error: $*" fi cat <> "$logFile" 2>&1 + test -d "$SOLANA_ROOT" + ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin" + rsync -vPrc -e "ssh ${sshOptions[*]}" \ + "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \ + "$ipAddress":~/solana/ } startLeader() { @@ -131,18 +126,17 @@ startLeader() { declare logFile="$2" echo "--- Starting leader: $leaderIp" - common_start_setup "$ipAddress" "$logFile" - # Deploy local binaries to leader. Validators and clients later fetch the # binaries from the leader. ( set -x + common_start_setup "$ipAddress" || exit 1 case $deployMethod in snap) - rsync -vPr -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap" + rsync -vPrc -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap" ;; local) - rsync -vPr -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/" + rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/" ;; *) usage "Internal error: invalid deployMethod: $deployMethod" @@ -151,7 +145,11 @@ startLeader() { ssh "${sshOptions[@]}" -n "$ipAddress" \ "./solana/net/remote/remote-node.sh $deployMethod leader $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\"" - ) >> "$logFile" 2>&1 + ) >> "$logFile" 2>&1 || { + cat "$logFile" + echo "^^^ +++" + exit 1 + } } startValidator() { @@ -160,8 +158,8 @@ startValidator() { echo "--- Starting validator: $leaderIp" ( - common_start_setup "$ipAddress" /dev/stdout set -x + common_start_setup "$ipAddress" ssh "${sshOptions[@]}" -n "$ipAddress" \ "./solana/net/remote/remote-node.sh $deployMethod validator $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\"" ) >> "$netLogDir/validator-$ipAddress.log" 2>&1 & @@ -173,14 +171,18 @@ startValidator() { startClient() { declare ipAddress=$1 declare logFile="$2" - echo "--- Starting client: $leaderIp" - common_start_setup "$ipAddress" "$logFile" + echo "--- Starting client: $ipAddress" ( set -x + common_start_setup "$ipAddress" ssh "${sshOptions[@]}" -f "$ipAddress" \ "./solana/net/remote/remote-client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\"" - ) >> "$logFile" 2>&1 + ) >> "$logFile" 2>&1 || { + cat "$logFile" + echo "^^^ +++" + exit 1 + } } sanity() { @@ -191,7 +193,7 @@ sanity() { # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally ssh "${sshOptions[@]}" "$leaderIp" \ "./solana/net/remote/remote-sanity.sh $sanityExtraArgs" - ) + ) || exit 1 } start() { diff --git a/net/remote/remote-client.sh b/net/remote/remote-client.sh old mode 100644 new mode 100755 index 510951bc4d..78ebc485c1 --- a/net/remote/remote-client.sh +++ b/net/remote/remote-client.sh @@ -1,4 +1,4 @@ -#!/bin/bash -e +#!/bin/bash -ex cd "$(dirname "$0")"/../.. @@ -22,12 +22,13 @@ scripts/install-earlyoom.sh case $deployMethod in snap) - rsync -vPr "$leaderIp:~/solana/solana.snap" . + rsync -vPrc "$leaderIp:~/solana/solana.snap" . sudo snap install solana.snap --devmode --dangerous rm solana.snap nodeConfig="\ leader-ip=$leaderIp \ + default-metrics-rate=1 \ metrics-config=$SOLANA_METRICS_CONFIG \ rust-log=$RUST_LOG \ " @@ -39,9 +40,10 @@ snap) local) PATH="$HOME"/.cargo/bin:"$PATH" export USE_INSTALL=1 + export SOLANA_DEFAULT_METRICS_RATE=1 export RUST_LOG - rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ + rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana" ;; *) @@ -49,14 +51,16 @@ local) exit 1 esac -scripts/oom-monitor.sh > oom-monitor.log 2>&1 & +scripts/oom-monitor.sh > oom-monitor.log 2>&1 & while true; do echo "=== Client start: $(date)" >> client.log - clientCommand="$solana_bench_tps --num-nodes $numNodes --loop -s 600 --sustained -t threadCount" + clientCommand="$solana_bench_tps --num-nodes $numNodes --seconds 600 --sustained --threads $threadCount" echo "$ $clientCommand" >> client.log + set +e $clientCommand >> client.log 2>&1 + set -e $metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1" echo Error: bench-tps should never exit | tee -a client.log diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index d1439ac7fa..246f143d46 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -28,11 +28,12 @@ scripts/install-earlyoom.sh case $deployMethod in snap) SECONDS=0 - rsync -vPr "$leaderIp:~/solana/solana.snap" . + rsync -vPrc "$leaderIp:~/solana/solana.snap" . sudo snap install solana.snap --devmode --dangerous commonNodeConfig="\ leader-ip=$leaderIp \ + default-metrics-rate=1 \ metrics-config=$SOLANA_METRICS_CONFIG \ rust-log=$RUST_LOG \ setup-args=$setupArgs \ @@ -65,6 +66,7 @@ local) PATH="$HOME"/.cargo/bin:"$PATH" export USE_INSTALL=1 export RUST_LOG + export SOLANA_DEFAULT_METRICS_RATE=1 if [[ -e /dev/nvidia0 ]]; then export SOLANA_CUDA=1 fi @@ -80,7 +82,7 @@ local) ./multinode-demo/leader.sh > leader.log 2>&1 & ;; validator) - rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ + rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ # shellcheck disable=SC2086 # Don't want to double quote "$setupArgs" ./multinode-demo/setup.sh -t validator -p $setupArgs diff --git a/net/remote/remote-sanity.sh b/net/remote/remote-sanity.sh index f4cae9bf04..7f0bf257dc 100755 --- a/net/remote/remote-sanity.sh +++ b/net/remote/remote-sanity.sh @@ -100,10 +100,13 @@ fi echo "--- $leaderIp: validator sanity" if $validatorSanity; then ( + set -ex -o pipefail ./multinode-demo/setup.sh -t validator - set -e pipefail - timeout 10s ./multinode-demo/validator.sh "$leaderIp" 2>&1 | tee validator.log - ) + timeout 10s ./multinode-demo/validator.sh "$leaderIp" "$leaderIp:8001" 2>&1 | tee validator.log + ) || { + exitcode=$? + [[ $exitcode -eq 124 ]] || exit $exitcode + } wc -l validator.log if grep -C100 panic validator.log; then echo "^^^ +++"