Improve error monitoring
This commit is contained in:
44
net/net.sh
44
net/net.sh
@ -13,7 +13,7 @@ usage() {
|
|||||||
echo "Error: $*"
|
echo "Error: $*"
|
||||||
fi
|
fi
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
usage: $0 [start|stop]
|
usage: $0 [start|stop|restart|sanity] [command-specific options]
|
||||||
|
|
||||||
Operate a configured testnet
|
Operate a configured testnet
|
||||||
|
|
||||||
@ -114,16 +114,11 @@ build() {
|
|||||||
|
|
||||||
common_start_setup() {
|
common_start_setup() {
|
||||||
declare ipAddress=$1
|
declare ipAddress=$1
|
||||||
declare logFile="$2"
|
test -d "$SOLANA_ROOT"
|
||||||
|
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
|
||||||
(
|
rsync -vPrc -e "ssh ${sshOptions[*]}" \
|
||||||
set -x
|
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
|
||||||
test -d "$SOLANA_ROOT"
|
"$ipAddress":~/solana/
|
||||||
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
|
|
||||||
rsync -vPr -e "ssh ${sshOptions[*]}" \
|
|
||||||
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
|
|
||||||
"$ipAddress":~/solana/
|
|
||||||
) >> "$logFile" 2>&1
|
|
||||||
}
|
}
|
||||||
|
|
||||||
startLeader() {
|
startLeader() {
|
||||||
@ -131,18 +126,17 @@ startLeader() {
|
|||||||
declare logFile="$2"
|
declare logFile="$2"
|
||||||
echo "--- Starting leader: $leaderIp"
|
echo "--- Starting leader: $leaderIp"
|
||||||
|
|
||||||
common_start_setup "$ipAddress" "$logFile"
|
|
||||||
|
|
||||||
# Deploy local binaries to leader. Validators and clients later fetch the
|
# Deploy local binaries to leader. Validators and clients later fetch the
|
||||||
# binaries from the leader.
|
# binaries from the leader.
|
||||||
(
|
(
|
||||||
set -x
|
set -x
|
||||||
|
common_start_setup "$ipAddress" || exit 1
|
||||||
case $deployMethod in
|
case $deployMethod in
|
||||||
snap)
|
snap)
|
||||||
rsync -vPr -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap"
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap"
|
||||||
;;
|
;;
|
||||||
local)
|
local)
|
||||||
rsync -vPr -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
|
rsync -vPrc -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
usage "Internal error: invalid deployMethod: $deployMethod"
|
usage "Internal error: invalid deployMethod: $deployMethod"
|
||||||
@ -151,7 +145,11 @@ startLeader() {
|
|||||||
|
|
||||||
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
||||||
"./solana/net/remote/remote-node.sh $deployMethod leader $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\""
|
"./solana/net/remote/remote-node.sh $deployMethod leader $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\""
|
||||||
) >> "$logFile" 2>&1
|
) >> "$logFile" 2>&1 || {
|
||||||
|
cat "$logFile"
|
||||||
|
echo "^^^ +++"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
startValidator() {
|
startValidator() {
|
||||||
@ -160,8 +158,8 @@ startValidator() {
|
|||||||
|
|
||||||
echo "--- Starting validator: $leaderIp"
|
echo "--- Starting validator: $leaderIp"
|
||||||
(
|
(
|
||||||
common_start_setup "$ipAddress" /dev/stdout
|
|
||||||
set -x
|
set -x
|
||||||
|
common_start_setup "$ipAddress"
|
||||||
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
ssh "${sshOptions[@]}" -n "$ipAddress" \
|
||||||
"./solana/net/remote/remote-node.sh $deployMethod validator $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\""
|
"./solana/net/remote/remote-node.sh $deployMethod validator $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\""
|
||||||
) >> "$netLogDir/validator-$ipAddress.log" 2>&1 &
|
) >> "$netLogDir/validator-$ipAddress.log" 2>&1 &
|
||||||
@ -173,14 +171,18 @@ startValidator() {
|
|||||||
startClient() {
|
startClient() {
|
||||||
declare ipAddress=$1
|
declare ipAddress=$1
|
||||||
declare logFile="$2"
|
declare logFile="$2"
|
||||||
echo "--- Starting client: $leaderIp"
|
echo "--- Starting client: $ipAddress"
|
||||||
common_start_setup "$ipAddress" "$logFile"
|
|
||||||
|
|
||||||
(
|
(
|
||||||
set -x
|
set -x
|
||||||
|
common_start_setup "$ipAddress"
|
||||||
ssh "${sshOptions[@]}" -f "$ipAddress" \
|
ssh "${sshOptions[@]}" -f "$ipAddress" \
|
||||||
"./solana/net/remote/remote-client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\""
|
"./solana/net/remote/remote-client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\""
|
||||||
) >> "$logFile" 2>&1
|
) >> "$logFile" 2>&1 || {
|
||||||
|
cat "$logFile"
|
||||||
|
echo "^^^ +++"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sanity() {
|
sanity() {
|
||||||
@ -191,7 +193,7 @@ sanity() {
|
|||||||
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
|
# shellcheck disable=SC2029 # remote-client.sh args are expanded on client side intentionally
|
||||||
ssh "${sshOptions[@]}" "$leaderIp" \
|
ssh "${sshOptions[@]}" "$leaderIp" \
|
||||||
"./solana/net/remote/remote-sanity.sh $sanityExtraArgs"
|
"./solana/net/remote/remote-sanity.sh $sanityExtraArgs"
|
||||||
)
|
) || exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
start() {
|
start() {
|
||||||
|
14
net/remote/remote-client.sh
Normal file → Executable file
14
net/remote/remote-client.sh
Normal file → Executable file
@ -1,4 +1,4 @@
|
|||||||
#!/bin/bash -e
|
#!/bin/bash -ex
|
||||||
|
|
||||||
cd "$(dirname "$0")"/../..
|
cd "$(dirname "$0")"/../..
|
||||||
|
|
||||||
@ -22,12 +22,13 @@ scripts/install-earlyoom.sh
|
|||||||
|
|
||||||
case $deployMethod in
|
case $deployMethod in
|
||||||
snap)
|
snap)
|
||||||
rsync -vPr "$leaderIp:~/solana/solana.snap" .
|
rsync -vPrc "$leaderIp:~/solana/solana.snap" .
|
||||||
sudo snap install solana.snap --devmode --dangerous
|
sudo snap install solana.snap --devmode --dangerous
|
||||||
rm solana.snap
|
rm solana.snap
|
||||||
|
|
||||||
nodeConfig="\
|
nodeConfig="\
|
||||||
leader-ip=$leaderIp \
|
leader-ip=$leaderIp \
|
||||||
|
default-metrics-rate=1 \
|
||||||
metrics-config=$SOLANA_METRICS_CONFIG \
|
metrics-config=$SOLANA_METRICS_CONFIG \
|
||||||
rust-log=$RUST_LOG \
|
rust-log=$RUST_LOG \
|
||||||
"
|
"
|
||||||
@ -39,9 +40,10 @@ snap)
|
|||||||
local)
|
local)
|
||||||
PATH="$HOME"/.cargo/bin:"$PATH"
|
PATH="$HOME"/.cargo/bin:"$PATH"
|
||||||
export USE_INSTALL=1
|
export USE_INSTALL=1
|
||||||
|
export SOLANA_DEFAULT_METRICS_RATE=1
|
||||||
export RUST_LOG
|
export RUST_LOG
|
||||||
|
|
||||||
rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
|
rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
|
||||||
solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana"
|
solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@ -49,14 +51,16 @@ local)
|
|||||||
exit 1
|
exit 1
|
||||||
esac
|
esac
|
||||||
|
|
||||||
scripts/oom-monitor.sh > oom-monitor.log 2>&1 &
|
scripts/oom-monitor.sh > oom-monitor.log 2>&1 &
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
echo "=== Client start: $(date)" >> client.log
|
echo "=== Client start: $(date)" >> client.log
|
||||||
clientCommand="$solana_bench_tps --num-nodes $numNodes --loop -s 600 --sustained -t threadCount"
|
clientCommand="$solana_bench_tps --num-nodes $numNodes --seconds 600 --sustained --threads $threadCount"
|
||||||
echo "$ $clientCommand" >> client.log
|
echo "$ $clientCommand" >> client.log
|
||||||
|
|
||||||
|
set +e
|
||||||
$clientCommand >> client.log 2>&1
|
$clientCommand >> client.log 2>&1
|
||||||
|
set -e
|
||||||
|
|
||||||
$metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1"
|
$metricsWriteDatapoint "testnet-deploy,name=$netBasename clientexit=1"
|
||||||
echo Error: bench-tps should never exit | tee -a client.log
|
echo Error: bench-tps should never exit | tee -a client.log
|
||||||
|
@ -28,11 +28,12 @@ scripts/install-earlyoom.sh
|
|||||||
case $deployMethod in
|
case $deployMethod in
|
||||||
snap)
|
snap)
|
||||||
SECONDS=0
|
SECONDS=0
|
||||||
rsync -vPr "$leaderIp:~/solana/solana.snap" .
|
rsync -vPrc "$leaderIp:~/solana/solana.snap" .
|
||||||
sudo snap install solana.snap --devmode --dangerous
|
sudo snap install solana.snap --devmode --dangerous
|
||||||
|
|
||||||
commonNodeConfig="\
|
commonNodeConfig="\
|
||||||
leader-ip=$leaderIp \
|
leader-ip=$leaderIp \
|
||||||
|
default-metrics-rate=1 \
|
||||||
metrics-config=$SOLANA_METRICS_CONFIG \
|
metrics-config=$SOLANA_METRICS_CONFIG \
|
||||||
rust-log=$RUST_LOG \
|
rust-log=$RUST_LOG \
|
||||||
setup-args=$setupArgs \
|
setup-args=$setupArgs \
|
||||||
@ -65,6 +66,7 @@ local)
|
|||||||
PATH="$HOME"/.cargo/bin:"$PATH"
|
PATH="$HOME"/.cargo/bin:"$PATH"
|
||||||
export USE_INSTALL=1
|
export USE_INSTALL=1
|
||||||
export RUST_LOG
|
export RUST_LOG
|
||||||
|
export SOLANA_DEFAULT_METRICS_RATE=1
|
||||||
if [[ -e /dev/nvidia0 ]]; then
|
if [[ -e /dev/nvidia0 ]]; then
|
||||||
export SOLANA_CUDA=1
|
export SOLANA_CUDA=1
|
||||||
fi
|
fi
|
||||||
@ -80,7 +82,7 @@ local)
|
|||||||
./multinode-demo/leader.sh > leader.log 2>&1 &
|
./multinode-demo/leader.sh > leader.log 2>&1 &
|
||||||
;;
|
;;
|
||||||
validator)
|
validator)
|
||||||
rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
|
rsync -vPrc "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/
|
||||||
|
|
||||||
# shellcheck disable=SC2086 # Don't want to double quote "$setupArgs"
|
# shellcheck disable=SC2086 # Don't want to double quote "$setupArgs"
|
||||||
./multinode-demo/setup.sh -t validator -p $setupArgs
|
./multinode-demo/setup.sh -t validator -p $setupArgs
|
||||||
|
@ -100,10 +100,13 @@ fi
|
|||||||
echo "--- $leaderIp: validator sanity"
|
echo "--- $leaderIp: validator sanity"
|
||||||
if $validatorSanity; then
|
if $validatorSanity; then
|
||||||
(
|
(
|
||||||
|
set -ex -o pipefail
|
||||||
./multinode-demo/setup.sh -t validator
|
./multinode-demo/setup.sh -t validator
|
||||||
set -e pipefail
|
timeout 10s ./multinode-demo/validator.sh "$leaderIp" "$leaderIp:8001" 2>&1 | tee validator.log
|
||||||
timeout 10s ./multinode-demo/validator.sh "$leaderIp" 2>&1 | tee validator.log
|
) || {
|
||||||
)
|
exitcode=$?
|
||||||
|
[[ $exitcode -eq 124 ]] || exit $exitcode
|
||||||
|
}
|
||||||
wc -l validator.log
|
wc -l validator.log
|
||||||
if grep -C100 panic validator.log; then
|
if grep -C100 panic validator.log; then
|
||||||
echo "^^^ +++"
|
echo "^^^ +++"
|
||||||
|
Reference in New Issue
Block a user