Morph gce_multinode-based scripts into net/

This commit is contained in:
Michael Vines
2018-08-28 10:19:33 -07:00
parent ffb72136c8
commit 399caf343c
16 changed files with 676 additions and 340 deletions

2
net/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/config/
/log/

29
net/README.md Normal file
View File

@@ -0,0 +1,29 @@
# Network Management
This directory contains scripts useful for working with a test network. It's
intended to be both dev and CD friendly.
### User Account Prerequisites
Log in to GCP with:
```bash
$ gcloud auth login
```
Also ensure that `$(whoami)` is the name of an InfluxDB user account with enough
access to create a new database.
You must currently be running on a Linux system (TODO: remove this restriction).
## Quick Start
```bash
$ cd net/
$ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here)
$ ./init-metrics.sh $(whoami) #<-- Configure a metrics database for the testnet
$ ./net.sh start #<-- Deploy the network from the local workspace
$ ./ssh.sh #<-- Details on how to ssh into any testnet node
$ ./gce.sh delete #<-- Dispose of the network (billing stops here)
```

41
net/common.sh Normal file
View File

@@ -0,0 +1,41 @@
# |source| this file
#
# Common utilities shared by other scripts in this directory
#
# The following directive disable complaints about unused variables in this
# file:
# shellcheck disable=2034
#
# Resolve the directory holding this file to an absolute path.  Everything
# derived from it (the config file location and the ssh private key path that
# gets recorded inside the config file) then stays valid regardless of the
# working directory a script is later started from.
netDir=$(CDPATH='' cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd) || exit 1
netConfigDir="$netDir"/config
netLogDir="$netDir"/log
mkdir -p "$netConfigDir" "$netLogDir"

configFile="$netConfigDir/config.sh"

# Defaults for the values that the config file is expected to provide.
clientIpList=()
leaderIp=
sshPrivateKey=
sshUsername=
sshOptions=()
validatorIpList=()
#######################################
# Source the testnet config file and verify that it defines the values the
# other scripts rely on, then populate the shared ssh option list.
# Globals:
#   configFile (read); leaderIp, validatorIpList, sshUsername, sshPrivateKey
#   (set by the sourced config file, then read); sshOptions (written)
# Arguments:
#   None
# Outputs:
#   On failure the problem is reported through the caller's usage() function
#   when one is defined; otherwise it is written to stderr.
# Returns:
#   Does not return on failure: the script exits non-zero.
#######################################
loadConfigFile() {
  declare configError=

  if [[ ! -r $configFile ]]; then
    configError="Config file unreadable: $configFile"
  else
    # shellcheck source=/dev/null
    source "$configFile"

    if [[ -z "$leaderIp" ]]; then
      configError="Config file invalid, leaderIp unspecified: $configFile"
    elif [[ ${#validatorIpList[@]} -eq 0 ]]; then
      configError="Config file invalid, validatorIpList unspecified: $configFile"
    elif [[ -z $sshUsername ]]; then
      configError="Config file invalid, sshUsername unspecified: $configFile"
    elif [[ -z $sshPrivateKey ]]; then
      configError="Config file invalid, sshPrivateKey unspecified: $configFile"
    fi
  fi

  if [[ -n $configError ]]; then
    # Not every script that sources this file defines usage(); fall back to a
    # plain error so the failure is never reported as "command not found".
    if declare -F usage > /dev/null; then
      usage "$configError"
    fi
    echo "Error: $configError" >&2
    exit 1
  fi

  sshOptions=(
    -o "BatchMode=yes"
    -o "StrictHostKeyChecking=no"
    -o "UserKnownHostsFile=/dev/null"
    -o "User=$sshUsername"
    -o "IdentityFile=$sshPrivateKey"
    -o "LogLevel=ERROR"
  )
}

172
net/gce.sh Executable file
View File

@@ -0,0 +1,172 @@
#!/bin/bash -e
here=$(dirname "$0")
# shellcheck source=scripts/gcloud.sh
source "$here"/../scripts/gcloud.sh
# shellcheck source=net/common.sh
source "$here"/common.sh
# Default instance-name prefix: "testnet-dev-" followed by the local username
# cut off at its first character outside [a-z0-9].  The sed script is quoted
# so the shell can never treat its bracket expression as a filename glob.
prefix=testnet-dev-$(whoami | sed -e 's/[^a-z0-9].*//')

# Node counts stay empty until supplied on the command line (-n / -c).
validatorNodeCount=
clientNodeCount=

imageName="ubuntu-16-04-cuda-9-2-new"
internalNetwork=false
zone="us-west1-b"
#######################################
# Print the command-line help and terminate the script.
# Globals:   prefix, zone, imageName (read, shown as defaults); exitcode (written)
# Arguments: optional error message; when given it is printed first and the
#            exit status becomes 1 instead of 0
# Outputs:   error line (if any) followed by the help text on stdout
#######################################
usage() {
  exitcode=0
  [[ -z "$1" ]] || {
    exitcode=1
    printf 'Error: %s\n' "$*"
  }
  cat <<EOF
usage: $0 [create|config|delete] [common options] [command-specific options]
Manage a GCE-based testnet
create - create a new testnet (implies 'config')
config - configure the testnet and write a config file describing it
delete - delete the testnet
common options:
-p prefix - Optional common prefix for instance names to avoid collisions
(default: $prefix)
create-specific options:
-n number - Number of validator nodes
-c number - Number of client nodes
-P - Use GCE internal/private network
-z - GCP Zone for the nodes (default: $zone)
-i imageName - Existing image on GCE (default: $imageName)
config-specific options:
none
delete-specific options:
none
EOF
  exit "$exitcode"
}
# The first argument selects the command; everything after it is parsed as
# options.
command=$1
if [[ -z $command ]]; then
  usage
fi
shift

case $command in
create | config | delete) ;;
*)
  usage "Invalid command: $command"
  ;;
esac

while getopts "h?p:Pi:n:c:z:" opt; do
  case $opt in
  p)
    prefix=$OPTARG
    ;;
  P)
    internalNetwork=true
    ;;
  i)
    imageName=$OPTARG
    ;;
  n)
    validatorNodeCount=$OPTARG
    ;;
  c)
    clientNodeCount=$OPTARG
    ;;
  z)
    zone=$OPTARG
    ;;
  h | \?)
    usage
    ;;
  *)
    usage "Error: unhandled option: $opt"
    ;;
  esac
done
#######################################
# Append a description of the running testnet to the config file: a fresh
# ssh key pair, the remote ssh username, and the IP addresses of the leader,
# validator and client instances.
# Globals:   configFile, netConfigDir, prefix, internalNetwork (read);
#            instances, gcloud_username (set by the gcloud_* helpers, read)
# Outputs:   appends to $configFile; progress messages on stdout
# Returns:   exits 1 when the leader or the validators cannot be found
#######################################
writeConfigFile() {
  printf '%s\n' "# autogenerated at $(date)" >> "$configFile"

  # Always start from a freshly generated key pair.
  declare sshPrivateKey="$netConfigDir/id_$prefix"
  rm -rf "$sshPrivateKey" "$sshPrivateKey.pub"
  (
    set -x
    ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
  )
  printf '%s\n' "sshPrivateKey=$sshPrivateKey" >> "$configFile"

  # Callback for gcloud_ForEachInstance: appends one IP address to the array
  # named by the 6th argument.  Arguments 1, 3 and 4 are used as the instance
  # name, public IP and private IP respectively.
  recordInstanceIp() {
    declare instanceName="$1"
    declare externalIp="$3"
    declare internalIp="$4"
    declare targetArray="$6"

    declare chosenIp="$externalIp"
    if $internalNetwork; then
      chosenIp="$internalIp"
    fi
    printf '%s\n' "$targetArray+=($chosenIp) # $instanceName" >> "$configFile"
  }

  # Leader: exactly one instance is required.
  gcloud_FindInstances "name=$prefix-leader" show
  if [[ ${#instances[@]} -ne 1 ]]; then
    echo "Unable to start leader"
    exit 1
  fi
  gcloud_FigureRemoteUsername "${instances[0]}"
  printf '%s\n' "sshUsername=$gcloud_username" >> "$configFile"
  gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
  printf '%s\n' "leaderIp=()" >> "$configFile"
  gcloud_ForEachInstance recordInstanceIp leaderIp

  # Validators: at least one instance is required.
  gcloud_FindInstances "name~^$prefix-validator" show
  if [[ ${#instances[@]} -le 0 ]]; then
    echo "Unable to start validators"
    exit 1
  fi
  printf '%s\n' "validatorIpList=()" >> "$configFile"
  gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
  gcloud_ForEachInstance recordInstanceIp validatorIpList

  # Clients are optional.
  printf '%s\n' "clientIpList=()" >> "$configFile"
  gcloud_FindInstances "name~^$prefix-client" show
  if [[ ${#instances[@]} -gt 0 ]]; then
    gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
    gcloud_ForEachInstance recordInstanceIp clientIpList
  fi

  echo "Wrote $configFile"
}
# Run the selected command.
if [[ $command = delete ]]; then
  gcloud_FindInstances "name~^$prefix-"
  if [[ ${#instances[@]} -eq 0 ]]; then
    echo "No instances found matching '^$prefix-'"
    exit 0
  fi
  gcloud_DeleteInstances
elif [[ $command = create ]]; then
  if [[ -z $validatorNodeCount ]]; then
    usage "Need number of nodes"
  fi

  # One leader, the requested number of validators, and clients only when a
  # client count was supplied.
  gcloud_CreateInstances "$prefix-leader" 1 "$zone" "$imageName"
  gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" "$imageName"
  if [[ -n $clientNodeCount ]]; then
    gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" "$imageName"
  fi

  writeConfigFile
elif [[ $command = config ]]; then
  writeConfigFile
else
  usage "Unknown command: $command"
fi

76
net/init-metrics.sh Executable file
View File

@@ -0,0 +1,76 @@
#!/bin/bash -e
here=$(dirname "$0")
# shellcheck source=net/common.sh
source "$here"/common.sh
#######################################
# Print the command-line help and terminate the script.
# Globals:   exitcode (written)
# Arguments: optional error message; when given it is printed first and the
#            exit status becomes 1 instead of 0
# Outputs:   error line (if any) followed by the help text on stdout
#######################################
usage() {
  exitcode=0
  if [[ -n "$1" ]]; then
    exitcode=1
    echo "Error: $*"
  fi
  cat <<EOF
usage: $0 [-d] [username] [optional database name]
Creates a testnet dev metrics database
username InfluxDB user with access to create a new database
database Uncommon. Optional database suffix to follow the mandatory
'testnet-dev-[username]' database name prefix
-d Delete the database instead of creating it
EOF
  exit $exitcode
}
# Option parsing: -d switches from "create" to "delete only"; -h shows help.
delete=false
while getopts "hd" opt; do
  if [[ $opt = d ]]; then
    delete=true
  elif [[ $opt = h || $opt = "?" ]]; then
    usage
    exit 0
  else
    usage "Error: unhandled option: $opt"
  fi
done
# Positional arguments: the InfluxDB username and an optional database suffix.
shift "$((OPTIND - 1))"

username=$1
if [[ -z "$username" ]]; then
  usage "username not specified"
fi

# The database name is always "testnet-dev-<username>", optionally followed by
# "-<suffix>" when a second argument is given.
database="testnet-dev-$username${2:+-$2}"

# Prompt without echoing the password back to the terminal.
read -rs -p "InfluxDB password for $username: " password
if [[ -z $password ]]; then
  echo "Password not specified"
  exit 1
fi
echo
#######################################
# Echo an InfluxDB query and POST it to the metrics server.
# Globals:   username, password (read)
# Arguments: the query text (all arguments are joined with spaces)
# Outputs:   the query, followed by whatever curl prints
# Returns:   curl's exit status
#######################################
query() {
  echo "$*"
  # Credentials are sent with HTTP basic authentication rather than being
  # spliced into the URL: a password containing '&', '#', '%' or whitespace
  # would otherwise corrupt the query string.
  # NOTE(review): the password is still visible in curl's argument list.
  curl -XPOST \
    --user "${username}:${password}" \
    "https://metrics.solana.com:8086/query" \
    --data-urlencode "q=$*"
}
# Always start by dropping any existing database; with -d that is all we do.
query "DROP DATABASE \"$database\""
if $delete; then
  exit 0
fi

# Recreate the database and its retention policy and grants.
statements=(
  "CREATE DATABASE \"$database\""
  "ALTER RETENTION POLICY autogen ON \"$database\" DURATION 7d"
  "GRANT READ ON \"$database\" TO \"ro\""
  "GRANT WRITE ON \"$database\" TO \"scratch_writer\""
)
for statement in "${statements[@]}"; do
  query "$statement"
done

# Record the metrics configuration in the shared testnet config file.
echo "export SOLANA_METRICS_CONFIG=\"db=$database,u=scratch_writer,p=topsecret\" " >> "$configFile"

exit 0

197
net/net.sh Executable file
View File

@@ -0,0 +1,197 @@
#!/bin/bash -e
here=$(dirname "$0")
# Absolute path of the workspace root (the parent of this script's directory).
# '&&' makes a failed cd fail the assignment instead of silently recording the
# current working directory.
SOLANA_ROOT="$(cd "$here"/.. && pwd)"
# shellcheck source=net/common.sh
source "$here"/common.sh
#######################################
# Print the command-line help and terminate the script.
# Globals:   exitcode (written)
# Arguments: optional error message; when given it is printed first and the
#            exit status becomes 1 instead of 0
# Outputs:   error line (if any) followed by the help text on stdout
#######################################
usage() {
  exitcode=0
  [[ -z "$1" ]] || {
    exitcode=1
    printf 'Error: %s\n' "$*"
  }
  cat <<EOF
usage: $0 [start|stop]
Manage a multinode network
start|stop - Start or stop the network
EOF
  exit "$exitcode"
}
# The first argument selects the command; the remainder is parsed as options.
command=$1
if [[ -z $command ]]; then
  usage
fi
shift

case $command in
start | stop) ;;
*)
  usage "Invalid command: $command"
  ;;
esac

while getopts "h?" opt; do
  if [[ $opt = h || $opt = "?" ]]; then
    usage
  else
    usage "Error: unhandled option: $opt"
  fi
done

# Everything below needs the node addresses and ssh settings from the config.
loadConfigFile
#######################################
# Build and install the workspace locally with cargo, reporting how long it
# took.  Only supported on Linux.
# Globals:   SOLANA_ROOT (read); SECONDS (reset)
# Outputs:   progress messages on stdout
# Returns:   exits 1 when not running on Linux
#######################################
build() {
  [[ $(uname) == Linux ]] || {
    echo "Unable to build, this isn't a Linux system"
    exit 1
  }

  SECONDS=0
  (
    # Subshell so the directory change and PATH tweak stay local.
    cd "$SOLANA_ROOT"
    printf '%s\n' "****************" "Build started at $(date)"

    # Build and install locally
    PATH="$HOME/.cargo/bin:$PATH"
    cargo install --force
  )
  echo "Build took $SECONDS seconds"
}
#######################################
# Prepare a remote node: install prerequisites over ssh, then rsync the
# scripts it needs from the local workspace into ~/solana on the node.
# Globals:   sshOptions, SOLANA_ROOT (read)
# Arguments: $1 - IP address of the node
#            $2 - log file that receives the output of the commands
#######################################
common_start_setup() {
  declare nodeIp=$1
  declare nodeLog="$2"

  # Command list executed on the node.
  declare remoteSetup="
set -ex;
sudo systemctl disable apt-daily.service # disable run when system boot
sudo systemctl disable apt-daily.timer # disable timer run
sudo apt-get --assume-yes install rsync libssl-dev;
mkdir -p ~/solana ~/.cargo/bin;
"
  # Workspace entries copied to the node.
  declare -a syncSources=(
    "$SOLANA_ROOT"/fetch-perf-libs.sh
    "$SOLANA_ROOT"/scripts
    "$SOLANA_ROOT"/net
    "$SOLANA_ROOT"/multinode-demo
  )

  (
    set -x
    ssh "${sshOptions[@]}" "$nodeIp" "$remoteSetup"
    test -d "$SOLANA_ROOT"
    rsync -vPrz -e "ssh ${sshOptions[*]}" "${syncSources[@]}" "$nodeIp":~/solana/
  ) >> "$nodeLog"
}
#######################################
# Deploy and launch the leader: run the common node setup, copy the locally
# installed solana binaries to the node, then start the remote leader script.
# Globals:   leaderIp, sshOptions (read)
# Arguments: $1 - IP address of the leader node
#            $2 - log file that receives the output of the commands
# Outputs:   a banner on stdout
#######################################
startLeader() {
  declare nodeIp=$1
  declare nodeLog="$2"

  printf '%s\n' "****************" "Starting leader: $leaderIp"

  common_start_setup "$nodeIp" "$nodeLog"

  (
    set -x
    rsync -vPrz -e "ssh ${sshOptions[*]}" ~/.cargo/bin/solana* "$nodeIp":~/.cargo/bin/
    ssh "${sshOptions[@]}" -f "$nodeIp" "./solana/net/remote/remote_leader.sh"
  ) >> "$nodeLog"
}
#######################################
# Deploy and launch one validator: run the common node setup, then start the
# remote validator script, passing it the leader's IP address.
# Globals:   leaderIp, sshOptions (read)
# Arguments: $1 - IP address of the validator node
#            $2 - log file that receives the output of the commands
# Outputs:   a banner on stdout naming the validator being started
#######################################
startValidator() {
  declare ipAddress=$1
  declare logFile="$2"

  echo "*******************"
  # Report the node actually being started (not the leader's address).
  echo "Starting validator: $ipAddress"

  common_start_setup "$ipAddress" "$logFile"
  (
    set -x
    ssh "${sshOptions[@]}" -f "$ipAddress" \
      "./solana/net/remote/remote_validator.sh $leaderIp"
  ) >> "$logFile"
}
#######################################
# Deploy and launch one client: run the common node setup, then start the
# remote client script, passing it the leader's IP address.
# Globals:   leaderIp, sshOptions (read)
# Arguments: $1 - IP address of the client node
#            $2 - log file that receives the output of the commands
# Outputs:   a banner on stdout naming the client being started
#######################################
startClient() {
  declare ipAddress=$1
  declare logFile="$2"

  echo "****************"
  # Report the node actually being started (not the leader's address).
  echo "Starting client: $ipAddress"

  common_start_setup "$ipAddress" "$logFile"
  ssh "${sshOptions[@]}" -f "$ipAddress" \
    "./solana/net/remote/remote_client.sh $leaderIp" >> "$logFile"
}
#######################################
# Deploy the whole network: the leader first, then all validators in
# parallel, then the clients one after another, and print a timing summary.
# Globals:   leaderIp, validatorIpList, clientIpList, netLogDir (read);
#            SECONDS, leaderDeployTime, validatorDeployTime, clientDeployTime,
#            ipAddress (written)
# Outputs:   progress and summary on stdout
# Returns:   1 when at least one validator deployment failed
#######################################
start() {
  declare pid
  declare -a validatorPids=()
  declare failedValidatorCount=0

  echo "Deployment started at $(date)"

  SECONDS=0
  leaderDeployTime=
  startLeader "$leaderIp" "$netLogDir/leader-$leaderIp.log"
  leaderDeployTime=$SECONDS

  SECONDS=0
  for ipAddress in "${validatorIpList[@]}"; do
    startValidator "$ipAddress" "$netLogDir/validator-$ipAddress.log" &
    validatorPids+=("$!")
  done
  # A bare 'wait' always succeeds, which would hide failed deployments; wait
  # on each job individually so its exit status is observed.
  for pid in "${validatorPids[@]}"; do
    wait "$pid" || failedValidatorCount=$((failedValidatorCount + 1))
  done
  validatorDeployTime=$SECONDS
  if [[ $failedValidatorCount -gt 0 ]]; then
    echo "Error: $failedValidatorCount of ${#validatorIpList[@]} validator deployments failed, see logs in $netLogDir"
    return 1
  fi

  SECONDS=0
  for ipAddress in "${clientIpList[@]}"; do
    startClient "$ipAddress" "$netLogDir/client-$ipAddress.log"
  done
  clientDeployTime=$SECONDS

  echo
  echo "================================================================="
  echo "Deployment finished at $(date)"
  echo "Leader deployment took $leaderDeployTime seconds"
  echo "Validator deployment (${#validatorIpList[@]} instances) took $validatorDeployTime seconds"
  echo "Client deployment (${#clientIpList[@]} instances) took $clientDeployTime seconds"
  echo "Logs in $netLogDir:"
  ls -l "$netLogDir"
}
#######################################
# Kill the solana processes on one node.  Best effort: a failing ssh or
# pkill never fails the caller.
# Globals:   sshOptions (read)
# Arguments: $1 - IP address of the node
# Outputs:   a banner on stdout
# Returns:   always 0
#######################################
stop_node() {
  local nodeIp=$1
  # Command list executed on the node.
  local remoteKill="
set -x;
pkill -9 solana-;
pkill -9 validator;
pkill -9 leader;
"

  printf '%s\n' "**************" "Stopping node: $nodeIp"
  (
    set -x
    ssh "${sshOptions[@]}" "$nodeIp" "$remoteKill"
  ) || :
}
#######################################
# Stop every node of the network: the leader first, then all validators and
# clients, and report how long it took.
# Globals:   leaderIp, validatorIpList, clientIpList (read);
#            SECONDS, ipAddress (written)
# Outputs:   progress on stdout
#######################################
stop() {
  SECONDS=0

  stop_node "$leaderIp"

  declare -a otherNodes=("${validatorIpList[@]}" "${clientIpList[@]}")
  for ipAddress in "${otherNodes[@]}"; do
    stop_node "$ipAddress"
  done

  printf '%s\n' "Stopping nodes took $SECONDS seconds"
}
# Make sure the log directory that the start functions write into exists.
# This uses $netLogDir rather than a bare "log", which would create a stray
# directory in whatever working directory the script was launched from.
mkdir -p "$netLogDir"

# Run the selected command.  "start" always rebuilds locally and stops any
# previously running network before deploying.
if [[ $command == "start" ]]; then
  build
  stop
  start
elif [[ $command == "stop" ]]; then
  stop
else
  usage "Unknown command: $command"
fi

1
net/remote/README.md Normal file
View File

@@ -0,0 +1 @@
Scripts that run on the remote testnet nodes

15
net/remote/remote_client.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash -e
#
# Runs on a client node: fetches the solana binaries from the leader and
# starts the client in the background.
#
# usage: remote_client.sh [leader address]
#
if [[ -z $1 ]]; then
  exit 1
fi
leaderAddress=$1

# Work from the root of the deployed tree (two levels above this script).
cd "$(dirname "$0")"/../..

source net/common.sh
loadConfigFile

PATH="$HOME/.cargo/bin:$PATH"
export USE_INSTALL=1

rsync -vPrz "$leaderAddress":~/.cargo/bin/solana* ~/.cargo/bin/

numNodes=1 # TODO: Pass this in

multinode-demo/client.sh "$leaderAddress":~/solana $numNodes --loop -s 600 --sustained >client.log 2>&1 &

15
net/remote/remote_leader.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash -e
#
# Runs on the leader node: fetches the perf libraries, performs the multinode
# setup, then starts the drone and the leader in the background.
#

# Work from the root of the deployed tree (two levels above this script).
cd "$(dirname "$0")"/../..

source net/common.sh
loadConfigFile

PATH="$HOME/.cargo/bin:$PATH"
export USE_INSTALL=1
export SOLANA_CUDA=1

./fetch-perf-libs.sh
./multinode-demo/setup.sh

# Each daemon logs to a file named after it in the current directory.
for daemon in drone leader; do
  ./multinode-demo/"$daemon".sh >"$daemon".log 2>&1 &
done

15
net/remote/remote_validator.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash -e
#
# Runs on a validator node: fetches the solana binaries from the leader,
# performs the multinode setup and starts the validator in the background.
#
# usage: remote_validator.sh [leader address]
#
if [[ -z $1 ]]; then
  exit 1
fi
leaderAddress=$1

# Work from the root of the deployed tree (two levels above this script).
cd "$(dirname "$0")"/../..

source net/common.sh
loadConfigFile

PATH="$HOME/.cargo/bin:$PATH"
export USE_INSTALL=1

rsync -vPrz "$leaderAddress":~/.cargo/bin/solana* ~/.cargo/bin/

./multinode-demo/setup.sh
./multinode-demo/validator.sh "$leaderAddress":~/solana "$leaderAddress" >validator.log 2>&1 &

62
net/ssh.sh Executable file
View File

@@ -0,0 +1,62 @@
#!/bin/bash
here=$(dirname "$0")
# shellcheck source=net/common.sh
source "$here"/common.sh
#######################################
# Print the command-line help and terminate the script.
# Globals:   exitcode (written)
# Arguments: optional error message; when given it is printed first and the
#            exit status becomes 1 instead of 0
# Outputs:   error line (if any) followed by the help text on stdout
#######################################
usage() {
  exitcode=0
  [[ -z "$1" ]] || {
    exitcode=1
    printf 'Error: %s\n' "$*"
  }
  cat <<EOF
usage: $0 [ipAddress]
ssh into a node
ipAddress - IP address of the desired node.
If ipAddress is unspecified, a list of available nodes will be displayed.
EOF
  exit "$exitcode"
}
# The only recognised option is -h (or -?), which shows the help text.
while getopts "h?" opt; do
  if [[ $opt = h || $opt = "?" ]]; then
    usage
  else
    usage "Error: unhandled option: $opt"
  fi
done
loadConfigFile

# With an address argument, replace this process with an ssh session to it.
ipAddress=$1
[[ -z "$ipAddress" ]] || {
  set -x
  exec ssh "${sshOptions[@]}" "$ipAddress"
}

# Otherwise list a ready-to-run command line for every known node.
printf 'Leader:\n'
printf ' %s %s\n' "$0" "$leaderIp"

printf '\nValidators:\n'
for ipAddress in "${validatorIpList[@]}"; do
  printf ' %s %s\n' "$0" "$ipAddress"
done

printf '\nClients:\n'
if [[ ${#clientIpList[@]} -gt 0 ]]; then
  for ipAddress in "${clientIpList[@]}"; do
    printf ' %s %s\n' "$0" "$ipAddress"
  done
else
  echo " None"
fi

exit 0