GCE-based nodes now reboot on maintenance events instead of terminating (#5861)

This commit is contained in:
Michael Vines
2019-09-10 12:30:06 -07:00
committed by GitHub
parent 0d7efe5176
commit fc4aa71193
4 changed files with 96 additions and 44 deletions

View File

@ -59,11 +59,19 @@ genesisOptions="$genesisOptions"
airdropsEnabled=$airdropsEnabled
EOF
source scripts/oom-score-adj.sh
source net/common.sh
loadConfigFile
initCompleteFile=init-complete-node.log
# Create the on-reboot script and register it with cron's @reboot hook.
# Later steps append (cat >>) the commands that launch the node processes.
# The here-document delimiter is quoted so the body is written out literally,
# and the generated script aborts if the checkout directory is missing, since
# every command that follows in it relies on relative paths.
cat > ~/solana/on-reboot <<'EOF'
#!/usr/bin/env bash
cd ~/solana || exit 1
source scripts/oom-score-adj.sh
EOF
chmod +x ~/solana/on-reboot
# NOTE: `crontab -` replaces this user's entire crontab with this one entry.
echo "@reboot ~/solana/on-reboot" | crontab -
waitForNodeToInit() {
echo "--- waiting for node to boot up"
SECONDS=
@ -87,6 +95,13 @@ local|tar|skip)
./fetch-perf-libs.sh
# shellcheck source=/dev/null
source ./target/perf-libs/env.sh
cat >> ~/solana/on-reboot <<EOF
PATH="$HOME"/.cargo/bin:"$PATH"
export USE_INSTALL=1
# shellcheck source=/dev/null
source ./target/perf-libs/env.sh
SUDO_OK=1 source scripts/tune-system.sh
(
@ -98,12 +113,14 @@ local|tar|skip)
scripts/net-stats.sh > net-stats.log 2>&1 &
echo $! > net-stats.pid
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
echo Selecting solana-validator-cuda
export SOLANA_CUDA=1
fi
EOF
case $nodeType in
bootstrap-leader)
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
echo Selecting solana-validator-cuda
export SOLANA_CUDA=1
fi
set -x
if [[ $skipSetup != true ]]; then
rm -rf ./solana-node-keys
@ -175,16 +192,24 @@ EOF
)
if [[ $airdropsEnabled = true ]]; then
cat >> ~/solana/on-reboot <<EOF
./multinode-demo/drone.sh > drone.log 2>&1 &
EOF
fi
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
args+=($extraNodeArgs)
nohup ./multinode-demo/bootstrap-leader.sh "${args[@]}" > fullnode.log 2>&1 &
pid=$!
oom_score_adj "$pid" 1000
# Append the bootstrap-leader launch command to on-reboot.
# Each element of args is shell-quoted with %q so that values containing
# whitespace or shell metacharacters reach the node unchanged when on-reboot
# is parsed; a bare ${args[@]} in the here-document would be flattened to
# plain text and re-split at that point.  The length guard avoids emitting a
# spurious '' argument when args is empty.
quotedArgs=
if (( ${#args[@]} > 0 )); then
  printf -v quotedArgs '%q ' "${args[@]}"
fi
cat >> ~/solana/on-reboot <<EOF
nohup ./multinode-demo/bootstrap-leader.sh $quotedArgs> fullnode.log 2>&1 &
pid=\$!
oom_score_adj "\$pid" 1000
disown
EOF
~/solana/on-reboot
waitForNodeToInit
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
solana --url http://"$entrypointIp":8899 \
--keypair ~/solana/config/bootstrap-leader/identity-keypair.json \
validator-info publish "$(hostname)" -n team/solana --force || true
;;
validator|blockstreamer)
@ -197,11 +222,6 @@ EOF
"$entrypointIp":~/solana/solana-node-keys/"$nodeIndex" ~/solana/fullnode-identity.json
fi
if [[ -e /dev/nvidia0 && -x ~/.cargo/bin/solana-validator-cuda ]]; then
echo Selecting solana-validator-cuda
export SOLANA_CUDA=1
fi
args=(
--entrypoint "$entrypointIp:8001"
--gossip-port 8001
@ -240,7 +260,9 @@ EOF
# a location that somebody would expect to be able to airdrop from
scp "$entrypointIp":~/solana/config/mint-keypair.json config/
if [[ $airdropsEnabled = true ]]; then
cat >> ~/solana/on-reboot <<EOF
./multinode-demo/drone.sh > drone.log 2>&1 &
EOF
fi
# Grab the TLS cert generated by /certbot-restore.sh
@ -249,30 +271,39 @@ EOF
ls -l .cert.pem .key.pem
fi
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
npm install @solana/blockexplorer@1
npx solana-blockexplorer > blockexplorer.log 2>&1 &
# Confirm the blockexplorer is accessible
curl --head --retry 3 --retry-connrefused http://localhost:5000/
cat >> ~/solana/on-reboot <<EOF
export BLOCKEXPLORER_GEOIP_WHITELIST=$PWD/net/config/geoip.yml
npx solana-blockexplorer > blockexplorer.log 2>&1 &
# Redirect port 80 to port 5000
sudo iptables -A INPUT -p tcp --dport 80 -j ACCEPT
sudo iptables -A INPUT -p tcp --dport 5000 -j ACCEPT
sudo iptables -A PREROUTING -t nat -p tcp --dport 80 -j REDIRECT --to-port 5000
# Confirm the blockexplorer is now globally accessible
curl --head "$(curl ifconfig.io)"
EOF
fi
args+=(--init-complete-file "$initCompleteFile")
# shellcheck disable=SC2206 # Don't want to double quote $extraNodeArgs
args+=($extraNodeArgs)
nohup ./multinode-demo/validator.sh "${args[@]}" > fullnode.log 2>&1 &
pid=$!
oom_score_adj "$pid" 1000
# Append the validator launch command to on-reboot.
# Each element of args is shell-quoted with %q so that values containing
# whitespace or shell metacharacters reach the node unchanged when on-reboot
# is parsed; a bare ${args[@]} in the here-document would be flattened to
# plain text and re-split at that point.  The length guard avoids emitting a
# spurious '' argument when args is empty.
quotedArgs=
if (( ${#args[@]} > 0 )); then
  printf -v quotedArgs '%q ' "${args[@]}"
fi
cat >> ~/solana/on-reboot <<EOF
nohup ./multinode-demo/validator.sh $quotedArgs> fullnode.log 2>&1 &
pid=\$!
oom_score_adj "\$pid" 1000
disown
EOF
~/solana/on-reboot
waitForNodeToInit
if [[ $nodeType = blockstreamer ]]; then
# Confirm the blockexplorer is accessible
curl --head --retry 3 --retry-connrefused http://localhost:5000/
# Confirm the blockexplorer is now globally accessible
curl --head "$(curl ifconfig.io)"
fi
if [[ $skipSetup != true && $nodeType != blockstreamer ]]; then
args=(
--url http://"$entrypointIp":8899
@ -289,7 +320,8 @@ EOF
./multinode-demo/delegate-stake.sh "${args[@]}"
fi
solana --url http://"$entrypointIp":8899 -k ~/solana/fullnode-identity.json \
solana --url http://"$entrypointIp":8899 \
--keypair ~/solana/fullnode-identity.json \
validator-info publish "$(hostname)" -n team/solana --force || true
;;
replicator)
@ -308,9 +340,13 @@ EOF
exit 1
fi
nohup ./multinode-demo/replicator.sh "${args[@]}" > fullnode.log 2>&1 &
pid=$!
oom_score_adj "$pid" 1000
# Append the replicator launch command to on-reboot.
# Each element of args is shell-quoted with %q so that values containing
# whitespace or shell metacharacters reach the node unchanged when on-reboot
# is parsed; a bare ${args[@]} in the here-document would be flattened to
# plain text and re-split at that point.  The length guard avoids emitting a
# spurious '' argument when args is empty.
quotedArgs=
if (( ${#args[@]} > 0 )); then
  printf -v quotedArgs '%q ' "${args[@]}"
fi
cat >> ~/solana/on-reboot <<EOF
nohup ./multinode-demo/replicator.sh $quotedArgs> fullnode.log 2>&1 &
pid=\$!
oom_score_adj "\$pid" 1000
disown
EOF
~/solana/on-reboot
sleep 1
;;
*)
@ -318,9 +354,9 @@ EOF
exit 1
;;
esac
disown
;;
*)
echo "Unknown deployment method: $deployMethod"
exit 1
esac