Improve error monitoring

This commit is contained in:
Michael Vines
2018-09-04 15:16:25 -07:00
parent 06fd945f85
commit db9219ccc8
4 changed files with 42 additions and 31 deletions

View File

@ -13,7 +13,7 @@ usage() {
echo "Error: $*" echo "Error: $*"
fi fi
cat <<EOF cat <<EOF
usage: $0 [start|stop] usage: $0 [start|stop|restart|sanity] [command-specific options]
Operate a configured testnet Operate a configured testnet
@ -114,16 +114,11 @@ build() {
common_start_setup() { common_start_setup() {
declare ipAddress=$1 declare ipAddress=$1
declare logFile="$2" test -d "$SOLANA_ROOT"
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
( rsync -vPrc -e "ssh ${sshOptions[*]}" \
set -x "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
test -d "$SOLANA_ROOT" "$ipAddress":~/solana/
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
rsync -vPr -e "ssh ${sshOptions[*]}" \
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
"$ipAddress":~/solana/
) >> "$logFile" 2>&1
} }
startLeader() { startLeader() {
@ -131,18 +126,17 @@ startLeader() {
declare logFile="$2" declare logFile="$2"
echo "--- Starting leader: $leaderIp" echo "--- Starting leader: $leaderIp"
common_start_setup "$ipAddress" "$logFile"
# Deploy local binaries to leader. Validators and clients later fetch the # Deploy local binaries to leader. Validators and clients later fetch the
# binaries from the leader. # binaries from the leader.
( (
set -x set -x
common_start_setup "$ipAddress" || exit 1
case $deployMethod in case $deployMethod in
snap) snap)
rsync -vPr -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap" rsync -vPrc -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap"
;; ;;
local) local)
rsync -vPr -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/" rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
;; ;;
*) *)
usage "Internal error: invalid deployMethod: $deployMethod" usage "Internal error: invalid deployMethod: $deployMethod"
@ -151,7 +145,11 @@ startLeader() {
ssh "${sshOptions[@]}" -n "$ipAddress" \ ssh "${sshOptions[@]}" -n "$ipAddress" \
"./solana/net/remote/remote-node.sh $deployMethod leader $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\"" "./solana/net/remote/remote-node.sh $deployMethod leader $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\""
) >> "$logFile" 2>&1 ) >> "$logFile" 2>&1 || {
cat "$logFile"
echo "^^^ +++"
exit 1
}
} }
startValidator() { startValidator() {
@ -160,8 +158,8 @@ startValidator() {
echo "--- Starting validator: $leaderIp" echo "--- Starting validator: $leaderIp"
( (
common_start_setup "$ipAddress" /dev/stdout
set -x set -x
common_start_setup "$ipAddress"
ssh "${sshOptions[@]}" -n "$ipAddress" \ ssh "${sshOptions[@]}" -n "$ipAddress" \
"./solana/net/remote/remote-node.sh $deployMethod validator $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\"" "./solana/net/remote/remote-node.sh $deployMethod validator $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\""
) >> "$netLogDir/validator-$ipAddress.log" 2>&1 & ) >> "$netLogDir/validator-$ipAddress.log" 2>&1 &
@ -173,14 +171,18 @@ startValidator() {
startClient() { startClient() {
declare ipAddress=$1 declare ipAddress=$1
declare logFile="$2" declare logFile="$2"
echo "--- Starting client: $leaderIp" echo "--- Starting client: $ipAddress"
common_start_setup "$ipAddress" "$logFile"
( (
set -x set -x
common_start_setup "$ipAddress"
ssh "${sshOptions[@]}" -f "$ipAddress" \ ssh "${sshOptions[@]}" -f "$ipAddress" \
"./solana/net/remote/remote-client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\"" "./solana/net/remote/remote-client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\""
) >> "$logFile" 2>&1 ) >> "$logFile" 2>&1 || {
cat "$logFile"
echo "^^^ +++"
exit 1
}
} }
sanity() { sanity() {
@ -191,7 +193,7 @@ sanity() {
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally # shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
ssh "${sshOptions[@]}" "$leaderIp" \ ssh "${sshOptions[@]}" "$leaderIp" \
"./solana/net/remote/remote-sanity.sh $sanityExtraArgs" "./solana/net/remote/remote-sanity.sh $sanityExtraArgs"
) ) || exit 1
} }
start() { start() {

14
net/remote/remote-client.sh Normal file → Executable file
View File

@ -1,4 +1,4 @@
#!/bin/bash -e #!/bin/bash -ex
cd "$(dirname "$0")"/../.. cd "$(dirname "$0")"/../..
@ -22,12 +22,13 @@ scripts/install-earlyoom.sh
case $deployMethod in case $deployMethod in
snap) snap)
rsync -vPr "$leaderIp:~/solana/solana.snap" . rsync -vPrc "$leaderIp:~/solana/solana.snap" .
sudo snap install solana.snap --devmode --dangerous sudo snap install solana.snap --devmode --dangerous
rm solana.snap rm solana.snap
nodeConfig="\ nodeConfig="\
leader-ip=$leaderIp \ leader-ip=$leaderIp \
default-metrics-rate=1 \
metrics-config=$SOLANA_METRICS_CONFIG \ metrics-config=$SOLANA_METRICS_CONFIG \
rust-log=$RUST_LOG \ rust-log=$RUST_LOG \
" "
@ -39,9 +40,10 @@ snap)
local) local)
PATH="$HOME"/.cargo/bin:"$PATH" PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1 export USE_INSTALL=1
export SOLANA_DEFAULT_METRICS_RATE=1
export RUST_LOG export RUST_LOG
rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana" solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana"
;; ;;
*) *)
@ -49,14 +51,16 @@ local)
exit 1 exit 1
esac esac
scripts/oom-monitor.sh > oom-monitor.log 2>&1 & scripts/oom-monitor.sh > oom-monitor.log 2>&1 &
while true; do while true; do
echo "=== Client start: $(date)" >> client.log echo "=== Client start: $(date)" >> client.log
clientCommand="$solana_bench_tps --num-nodes $numNodes --loop -s 600 --sustained -t threadCount" clientCommand="$solana_bench_tps --num-nodes $numNodes --seconds 600 --sustained --threads $threadCount"
echo "$ $clientCommand" >> client.log echo "$ $clientCommand" >> client.log
set +e
$clientCommand >> client.log 2>&1 $clientCommand >> client.log 2>&1
set -e
$metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1" $metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1"
echo Error: bench-tps should never exit | tee -a client.log echo Error: bench-tps should never exit | tee -a client.log

View File

@ -28,11 +28,12 @@ scripts/install-earlyoom.sh
case $deployMethod in case $deployMethod in
snap) snap)
SECONDS=0 SECONDS=0
rsync -vPr "$leaderIp:~/solana/solana.snap" . rsync -vPrc "$leaderIp:~/solana/solana.snap" .
sudo snap install solana.snap --devmode --dangerous sudo snap install solana.snap --devmode --dangerous
commonNodeConfig="\ commonNodeConfig="\
leader-ip=$leaderIp \ leader-ip=$leaderIp \
default-metrics-rate=1 \
metrics-config=$SOLANA_METRICS_CONFIG \ metrics-config=$SOLANA_METRICS_CONFIG \
rust-log=$RUST_LOG \ rust-log=$RUST_LOG \
setup-args=$setupArgs \ setup-args=$setupArgs \
@ -65,6 +66,7 @@ local)
PATH="$HOME"/.cargo/bin:"$PATH" PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1 export USE_INSTALL=1
export RUST_LOG export RUST_LOG
export SOLANA_DEFAULT_METRICS_RATE=1
if [[ -e /dev/nvidia0 ]]; then if [[ -e /dev/nvidia0 ]]; then
export SOLANA_CUDA=1 export SOLANA_CUDA=1
fi fi
@ -80,7 +82,7 @@ local)
./multinode-demo/leader.sh > leader.log 2>&1 & ./multinode-demo/leader.sh > leader.log 2>&1 &
;; ;;
validator) validator)
rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
# shellcheck disable=SC2086 # Don't want to double quote "$setupArgs" # shellcheck disable=SC2086 # Don't want to double quote "$setupArgs"
./multinode-demo/setup.sh -t validator -p $setupArgs ./multinode-demo/setup.sh -t validator -p $setupArgs

View File

@ -100,10 +100,13 @@ fi
echo "--- $leaderIp: validator sanity" echo "--- $leaderIp: validator sanity"
if $validatorSanity; then if $validatorSanity; then
( (
set -ex -o pipefail
./multinode-demo/setup.sh -t validator ./multinode-demo/setup.sh -t validator
set -e pipefail timeout 10s ./multinode-demo/validator.sh "$leaderIp" "$leaderIp:8001" 2>&1 | tee validator.log
timeout 10s ./multinode-demo/validator.sh "$leaderIp" 2>&1 | tee validator.log ) || {
) exitcode=$?
[[ $exitcode -eq 124 ]] || exit $exitcode
}
wc -l validator.log wc -l validator.log
if grep -C100 panic validator.log; then if grep -C100 panic validator.log; then
echo "^^^ +++" echo "^^^ +++"