Improve error monitoring

This commit is contained in:
Michael Vines
2018-09-04 15:16:25 -07:00
parent 06fd945f85
commit db9219ccc8
4 changed files with 42 additions and 31 deletions

14
net/remote/remote-client.sh Normal file → Executable file
View File

@@ -1,4 +1,4 @@
-#!/bin/bash -e
+#!/bin/bash -ex
cd "$(dirname "$0")"/../..
@@ -22,12 +22,13 @@ scripts/install-earlyoom.sh
case $deployMethod in
snap)
-rsync -vPr "$leaderIp:~/solana/solana.snap" .
+rsync -vPrc "$leaderIp:~/solana/solana.snap" .
sudo snap install solana.snap --devmode --dangerous
rm solana.snap
nodeConfig="\
leader-ip=$leaderIp \
default-metrics-rate=1 \
metrics-config=$SOLANA_METRICS_CONFIG \
rust-log=$RUST_LOG \
"
@@ -39,9 +40,10 @@ snap)
local)
PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1
export SOLANA_DEFAULT_METRICS_RATE=1
export RUST_LOG
-rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
+rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana"
;;
*)
@@ -49,14 +51,16 @@ local)
exit 1
esac
-scripts/oom-monitor.sh > oom-monitor.log 2>&1 &
+scripts/oom-monitor.sh > oom-monitor.log 2>&1 &
while true; do
echo "=== Client start: $(date)" >> client.log
-clientCommand="$solana_bench_tps --num-nodes $numNodes --loop -s 600 --sustained -t threadCount"
+clientCommand="$solana_bench_tps --num-nodes $numNodes --seconds 600 --sustained --threads $threadCount"
echo "$ $clientCommand" >> client.log
set +e
$clientCommand >> client.log 2>&1
set -e
$metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1"
echo Error: bench-tps should never exit | tee -a client.log

View File

@@ -28,11 +28,12 @@ scripts/install-earlyoom.sh
case $deployMethod in
snap)
SECONDS=0
-rsync -vPr "$leaderIp:~/solana/solana.snap" .
+rsync -vPrc "$leaderIp:~/solana/solana.snap" .
sudo snap install solana.snap --devmode --dangerous
commonNodeConfig="\
leader-ip=$leaderIp \
default-metrics-rate=1 \
metrics-config=$SOLANA_METRICS_CONFIG \
rust-log=$RUST_LOG \
setup-args=$setupArgs \
@@ -65,6 +66,7 @@ local)
PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1
export RUST_LOG
export SOLANA_DEFAULT_METRICS_RATE=1
if [[ -e /dev/nvidia0 ]]; then
export SOLANA_CUDA=1
fi
@@ -80,7 +82,7 @@ local)
./multinode-demo/leader.sh > leader.log 2>&1 &
;;
validator)
-rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
+rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
# shellcheck disable=SC2086 # Don't want to double quote "$setupArgs"
./multinode-demo/setup.sh -t validator -p $setupArgs

View File

@@ -100,10 +100,13 @@ fi
echo "--- $leaderIp: validator sanity"
if $validatorSanity; then
(
+set -ex -o pipefail
./multinode-demo/setup.sh -t validator
-set -e pipefail
-timeout 10s ./multinode-demo/validator.sh "$leaderIp" 2>&1 | tee validator.log
-)
+timeout 10s ./multinode-demo/validator.sh "$leaderIp" "$leaderIp:8001" 2>&1 | tee validator.log
+) || {
+exitcode=$?
+[[ $exitcode -eq 124 ]] || exit $exitcode
+}
wc -l validator.log
if grep -C100 panic validator.log; then
echo "^^^ +++"