Add AWS EC2 support

This commit is contained in:
Michael Vines
2018-09-16 14:46:08 -07:00
parent 27986d7abb
commit f89f121d2b
7 changed files with 644 additions and 271 deletions

View File

@ -5,15 +5,30 @@ intended to be both dev and CD friendly.
### User Account Prerequisites ### User Account Prerequisites
Log in to GCP with: GCP and AWS are supported.
#### GCP
First authenticate with
```bash ```bash
$ gcloud auth login $ gcloud auth login
``` ```
Also ensure that `$(whoami)` is the name of an InfluxDB user account with enough #### AWS
access to create a new database. Obtain your credentials from the AWS IAM Console and configure the AWS CLI with
```bash
$ aws configure
```
More information on AWS CLI configuration can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-quick-configuration)
### Metrics configuration
Ensure that `$(whoami)` is the name of an InfluxDB user account with enough
access to create a new InfluxDB database. Ask mvines@ for help if needed.
## Quick Start ## Quick Start
NOTE: This example uses GCP. If you are using AWS, replace `./gce.sh` with
`./ec2.sh` in the commands.
```bash ```bash
$ cd net/ $ cd net/
$ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here) $ ./gce.sh create -n 5 -c 1 #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here)
@ -32,6 +47,10 @@ network over public IP addresses:
```bash ```bash
$ ./gce.sh create -P ... $ ./gce.sh create -P ...
``` ```
or
```bash
$ ./ec2.sh create -P ...
```
### Deploying a Snap-based network ### Deploying a Snap-based network
To deploy the latest pre-built `edge` channel Snap (ie, latest from the `master` To deploy the latest pre-built `edge` channel Snap (ie, latest from the `master`
@ -46,6 +65,10 @@ First ensure the network instances are created with GPU enabled:
```bash ```bash
$ ./gce.sh create -g ... $ ./gce.sh create -g ...
``` ```
or
```bash
$ ./ec2.sh create -g ...
```
If deploying a Snap-based network nothing further is required, as GPU presence If deploying a Snap-based network nothing further is required, as GPU presence
is detected at runtime and the CUDA build is auto selected. is detected at runtime and the CUDA build is auto selected.
@ -58,9 +81,20 @@ $ ./net.sh start -f "cuda,erasure"
### How to interact with a CD testnet deployed by ci/testnet-deploy.sh ### How to interact with a CD testnet deployed by ci/testnet-deploy.sh
**AWS-Specific Extra Setup**: Follow the steps in `scripts/add-solana-user-authorized_keys.sh`,
then redeploy the testnet before continuing in this section.
Taking **master-testnet-solana-com** as an example, configure your workspace for Taking **master-testnet-solana-com** as an example, configure your workspace for
the testnet using: the testnet using:
``` ```bash
$ ./gce.sh config -p master-testnet-solana-com $ ./gce.sh config -p master-testnet-solana-com
$ ./ssh.sh # <-- Details on how to ssh into any testnet node ```
or
```bash
$ ./ec2.sh config -p master-testnet-solana-com
```
Then run the following for details on how to ssh into any testnet node
```bash
$ ./ssh.sh
``` ```

1
net/ec2.sh Symbolic link
View File

@ -0,0 +1 @@
gce.sh

View File

@ -1,27 +1,44 @@
#!/bin/bash -e #!/bin/bash -e
here=$(dirname "$0") here=$(dirname "$0")
# shellcheck source=net/scripts/gcloud.sh
source "$here"/scripts/gcloud.sh
# shellcheck source=net/common.sh # shellcheck source=net/common.sh
source "$here"/common.sh source "$here"/common.sh
cloudProvider=$(basename "$0" .sh)
case $cloudProvider in
gce)
# shellcheck source=net/scripts/gce-provider.sh
source "$here"/scripts/gce-provider.sh
imageName="ubuntu-16-04-cuda-9-2-new"
leaderMachineType=n1-standard-16
validatorMachineType=n1-standard-4
clientMachineType=n1-standard-16
;;
ec2)
# shellcheck source=net/scripts/ec2-provider.sh
source "$here"/scripts/ec2-provider.sh
imageName="ami-04169656fea786776"
leaderMachineType=m4.4xlarge
validatorMachineType=m4.xlarge
clientMachineType=m4.4xlarge
;;
*)
echo "Error: Unknown cloud provider: $cloudProvider"
;;
esac
prefix=testnet-dev-${USER//[^A-Za-z0-9]/} prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
validatorNodeCount=5 validatorNodeCount=5
clientNodeCount=1 clientNodeCount=1
leaderBootDiskSize=1TB leaderBootDiskSizeInGb=1000
leaderMachineType=n1-standard-16 validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
leaderAccelerator= clientBootDiskSizeInGb=40
validatorMachineType=n1-standard-4
validatorBootDiskSize=$leaderBootDiskSize
validatorAccelerator=
clientMachineType=n1-standard-16
clientBootDiskSize=40GB
clientAccelerator=
imageName="ubuntu-16-04-cuda-9-2-new"
publicNetwork=false publicNetwork=false
zone="us-west1-b" enableGpu=false
leaderAddress= leaderAddress=
usage() { usage() {
@ -33,7 +50,7 @@ usage() {
cat <<EOF cat <<EOF
usage: $0 [create|config|delete] [common options] [command-specific options] usage: $0 [create|config|delete] [common options] [command-specific options]
Configure a GCE-based testnet Manage testnet instances
create - create a new testnet (implies 'config') create - create a new testnet (implies 'config')
config - configure the testnet and write a config file describing it config - configure the testnet and write a config file describing it
@ -47,10 +64,13 @@ Configure a GCE-based testnet
-n [number] - Number of validator nodes (default: $validatorNodeCount) -n [number] - Number of validator nodes (default: $validatorNodeCount)
-c [number] - Number of client nodes (default: $clientNodeCount) -c [number] - Number of client nodes (default: $clientNodeCount)
-P - Use public network IP addresses (default: $publicNetwork) -P - Use public network IP addresses (default: $publicNetwork)
-z [zone] - GCP Zone for the nodes (default: $zone) -z [zone] - Zone for the nodes (default: $zone)
-i [imageName] - Existing image on GCE (default: $imageName) -g - Enable GPU (default: $enableGpu)
-g - Enable GPU -a [address] - Set the leader node's external IP address to this value.
-a [address] - Set the leader node's external IP address to this GCE address For GCE, [address] is the "name" of the desired External
IP Address.
For EC2, [address] is the "allocation ID" of the desired
Elastic IP.
config-specific options: config-specific options:
none none
@ -68,7 +88,7 @@ command=$1
shift shift
[[ $command = create || $command = config || $command = delete ]] || usage "Invalid command: $command" [[ $command = create || $command = config || $command = delete ]] || usage "Invalid command: $command"
while getopts "h?p:Pi:n:c:z:ga:" opt; do while getopts "h?p:Pn:c:z:ga:" opt; do
case $opt in case $opt in
h | \?) h | \?)
usage usage
@ -80,9 +100,6 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
P) P)
publicNetwork=true publicNetwork=true
;; ;;
i)
imageName=$OPTARG
;;
n) n)
validatorNodeCount=$OPTARG validatorNodeCount=$OPTARG
;; ;;
@ -90,10 +107,10 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
clientNodeCount=$OPTARG clientNodeCount=$OPTARG
;; ;;
z) z)
zone=$OPTARG cloud_SetZone "$OPTARG"
;; ;;
g) g)
leaderAccelerator="count=4,type=nvidia-tesla-k80" enableGpu=true
;; ;;
a) a)
leaderAddress=$OPTARG leaderAddress=$OPTARG
@ -108,6 +125,37 @@ shift $((OPTIND - 1))
[[ -z $1 ]] || usage "Unexpected argument: $1" [[ -z $1 ]] || usage "Unexpected argument: $1"
sshPrivateKey="$netConfigDir/id_$prefix" sshPrivateKey="$netConfigDir/id_$prefix"
# cloud_ForEachInstance [cmd] [extra args to cmd]
#
# Execute a command for each element in the `instances` array
#
# cmd - The command to execute on each instance
# The command will receive arguments followed by any
# additionl arguments supplied to cloud_ForEachInstance:
# name - name of the instance
# publicIp - The public IP address of this instance
# privateIp - The priate IP address of this instance
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
# ... - Extra args to cmd..
#
#
cloud_ForEachInstance() {
declare cmd="$1"
shift
[[ -n $cmd ]] || { echo cloud_ForEachInstance: cmd not specified; exit 1; }
declare count=1
for info in "${instances[@]}"; do
declare name publicIp privateIp
IFS=: read -r name publicIp privateIp < <(echo "$info")
eval "$cmd" "$name" "$publicIp" "$privateIp" "$count" "$@"
count=$((count + 1))
done
}
prepareInstancesAndWriteConfigFile() { prepareInstancesAndWriteConfigFile() {
$metricsWriteDatapoint "testnet-deploy net-config-begin=1" $metricsWriteDatapoint "testnet-deploy net-config-begin=1"
@ -122,10 +170,10 @@ EOF
recordInstanceIp() { recordInstanceIp() {
declare name="$1" declare name="$1"
declare publicIp="$3" declare publicIp="$2"
declare privateIp="$4" declare privateIp="$3"
declare arrayName="$6" declare arrayName="$5"
echo "$arrayName+=($publicIp) # $name" >> "$configFile" echo "$arrayName+=($publicIp) # $name" >> "$configFile"
if [[ $arrayName = "leaderIp" ]]; then if [[ $arrayName = "leaderIp" ]]; then
@ -139,121 +187,133 @@ EOF
waitForStartupComplete() { waitForStartupComplete() {
declare name="$1" declare name="$1"
declare publicIp="$3" declare publicIp="$2"
echo "Waiting for $name to finish booting..." echo "Waiting for $name to finish booting..."
( (
for i in $(seq 1 30); do for i in $(seq 1 30); do
if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.gce-startup-complete"); then if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.instance-startup-complete"); then
break break
fi fi
sleep 2 sleep 2
echo "Retry $i..." echo "Retry $i..."
done done
) )
echo "$name has booted."
} }
echo "Looking for leader instance..." echo "Looking for leader instance..."
gcloud_FindInstances "name=$prefix-leader" show cloud_FindInstance "$prefix-leader"
[[ ${#instances[@]} -eq 1 ]] || { [[ ${#instances[@]} -eq 1 ]] || {
echo "Unable to find leader" echo "Unable to find leader"
exit 1 exit 1
} }
echo "Fetching $sshPrivateKey from $leaderName"
( (
rm -rf "$sshPrivateKey"{,pub}
declare leaderName declare leaderName
declare leaderZone
declare leaderIp declare leaderIp
IFS=: read -r leaderName leaderZone leaderIp _ < <(echo "${instances[0]}") IFS=: read -r leaderName leaderIp _ < <(echo "${instances[0]}")
set -x # Try to ping the machine first.
timeout 60s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
# Try to ping the machine first. There can be a delay between when the if [[ ! -r $sshPrivateKey ]]; then
# instance is reported as RUNNING and when it's reachable over the network echo "Fetching $sshPrivateKey from $leaderName"
timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
# Try to scp in a couple times, sshd may not yet be up even though the # Try to scp in a couple times, sshd may not yet be up even though the
# machine can be pinged... # machine can be pinged...
set -o pipefail set -x -o pipefail
for i in $(seq 1 10); do for i in $(seq 1 30); do
if gcloud compute scp --zone "$leaderZone" \ if cloud_FetchFile "$leaderName" "$leaderIp" /solana-id_ecdsa "$sshPrivateKey"; then
"$leaderName:/solana-id_ecdsa" "$sshPrivateKey"; then break
break fi
fi
sleep 1
echo "Retry $i..."
done
chmod 400 "$sshPrivateKey" sleep 1
echo "Retry $i..."
done
chmod 400 "$sshPrivateKey"
ls -l "$sshPrivateKey"
fi
) )
echo "leaderIp=()" >> "$configFile" echo "leaderIp=()" >> "$configFile"
gcloud_ForEachInstance recordInstanceIp leaderIp cloud_ForEachInstance recordInstanceIp leaderIp
gcloud_ForEachInstance waitForStartupComplete cloud_ForEachInstance waitForStartupComplete
echo "Looking for validator instances..." echo "Looking for validator instances..."
gcloud_FindInstances "name~^$prefix-validator" show cloud_FindInstances "$prefix-validator"
[[ ${#instances[@]} -gt 0 ]] || { [[ ${#instances[@]} -gt 0 ]] || {
echo "Unable to find validators" echo "Unable to find validators"
exit 1 exit 1
} }
echo "validatorIpList=()" >> "$configFile" echo "validatorIpList=()" >> "$configFile"
gcloud_ForEachInstance recordInstanceIp validatorIpList cloud_ForEachInstance recordInstanceIp validatorIpList
gcloud_ForEachInstance waitForStartupComplete cloud_ForEachInstance waitForStartupComplete
echo "clientIpList=()" >> "$configFile" echo "clientIpList=()" >> "$configFile"
echo "Looking for client instances..." echo "Looking for client instances..."
gcloud_FindInstances "name~^$prefix-client" show cloud_FindInstances "$prefix-client"
[[ ${#instances[@]} -eq 0 ]] || { [[ ${#instances[@]} -eq 0 ]] || {
gcloud_ForEachInstance recordInstanceIp clientIpList cloud_ForEachInstance recordInstanceIp clientIpList
gcloud_ForEachInstance waitForStartupComplete cloud_ForEachInstance waitForStartupComplete
} }
echo "Wrote $configFile" echo "Wrote $configFile"
$metricsWriteDatapoint "testnet-deploy net-config-complete=1" $metricsWriteDatapoint "testnet-deploy net-config-complete=1"
} }
case $command in delete() {
delete)
$metricsWriteDatapoint "testnet-deploy net-delete-begin=1" $metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
# Delete the leader node first to prevent unusual metrics on the dashboard # Delete the leader node first to prevent unusual metrics on the dashboard
# during shutdown. # during shutdown.
# TODO: It would be better to fully cut-off metrics reporting before any # TODO: It would be better to fully cut-off metrics reporting before any
# instances are deleted. # instances are deleted.
for filter in "^$prefix-leader" "^$prefix-"; do for filter in "$prefix-leader" "$prefix-"; do
gcloud_FindInstances "name~$filter" echo "Searching for instances: $filter"
cloud_FindInstances "$filter"
if [[ ${#instances[@]} -eq 0 ]]; then if [[ ${#instances[@]} -eq 0 ]]; then
echo "No instances found matching '$filter'" echo "No instances found matching '$filter'"
else else
gcloud_DeleteInstances true cloud_DeleteInstances true
fi fi
done done
rm -f "$configFile" rm -f "$configFile"
$metricsWriteDatapoint "testnet-deploy net-delete-complete=1" $metricsWriteDatapoint "testnet-deploy net-delete-complete=1"
}
case $command in
delete)
delete
;; ;;
create) create)
[[ -n $validatorNodeCount ]] || usage "Need number of nodes" [[ -n $validatorNodeCount ]] || usage "Need number of nodes"
if [[ $validatorNodeCount -le 0 ]]; then
usage "One or more validator nodes is required"
fi
delete
$metricsWriteDatapoint "testnet-deploy net-create-begin=1" $metricsWriteDatapoint "testnet-deploy net-create-begin=1"
rm -rf "$sshPrivateKey"{,.pub} rm -rf "$sshPrivateKey"{,.pub}
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
# Note: using rsa because |aws ec2 import-key-pair| seems to fail for ecdsa
ssh-keygen -t rsa -N '' -f "$sshPrivateKey"
printNetworkInfo() { printNetworkInfo() {
cat <<EOF cat <<EOF
======================================================================================== ========================================================================================
Network composition: Network composition:
Leader = $leaderMachineType (GPU=${leaderAccelerator:-none}) Leader = $leaderMachineType (GPU=$enableGpu)
Validators = $validatorNodeCount x $validatorMachineType (GPU=${validatorAccelerator:-none}) Validators = $validatorNodeCount x $validatorMachineType
Client(s) = $clientNodeCount x $clientMachineType (GPU=${clientAccelerator:-none}) Client(s) = $clientNodeCount x $clientMachineType
======================================================================================== ========================================================================================
@ -261,7 +321,7 @@ EOF
} }
printNetworkInfo printNetworkInfo
declare startupScript="$netConfigDir"/gce-startup-script.sh declare startupScript="$netConfigDir"/instance-startup-script.sh
cat > "$startupScript" <<EOF cat > "$startupScript" <<EOF
#!/bin/bash -ex #!/bin/bash -ex
# autogenerated at $(date) # autogenerated at $(date)
@ -270,11 +330,12 @@ cat > /etc/motd <<EOM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
This instance has not been fully configured. This instance has not been fully configured.
See "startup-script" log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | grep startup-script See startup script log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | egrep \\(startup-script\\|cloud-init\)
To block until setup is complete, run: To block until setup is complete, run:
$ until [[ -f /.gce-startup-complete ]]; do sleep 1; done $ until [[ -f /.instance-startup-complete ]]; do sleep 1; done
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
EOM EOM
@ -296,6 +357,7 @@ $(
cat \ cat \
disable-background-upgrades.sh \ disable-background-upgrades.sh \
create-solana-user.sh \ create-solana-user.sh \
add-solana-user-authorized_keys.sh \
install-earlyoom.sh \ install-earlyoom.sh \
install-libssl-compatability.sh \ install-libssl-compatability.sh \
install-rsync.sh \ install-rsync.sh \
@ -305,21 +367,21 @@ cat > /etc/motd <<EOM
$(printNetworkInfo) $(printNetworkInfo)
EOM EOM
touch /.gce-startup-complete touch /.instance-startup-complete
EOF EOF
gcloud_CreateInstances "$prefix-leader" 1 "$zone" \ cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
"$imageName" "$leaderMachineType" "$leaderBootDiskSize" "$leaderAccelerator" \ "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
"$startupScript" "$leaderAddress" "$startupScript" "$leaderAddress"
gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" \ cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
"$imageName" "$validatorMachineType" "$validatorBootDiskSize" "$validatorAccelerator" \ "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
"$startupScript" "" "$startupScript" ""
if [[ $clientNodeCount -gt 0 ]]; then if [[ $clientNodeCount -gt 0 ]]; then
gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" \ cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$imageName" "$clientMachineType" "$clientBootDiskSize" "$clientAccelerator" \ "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
"$startupScript" "" "$startupScript" ""
fi fi

View File

@ -0,0 +1,20 @@
#!/bin/bash -ex
[[ $(uname) = Linux ]] || exit 1
[[ $USER = root ]] || exit 1
[[ -d /home/solana/.ssh ]] || exit 1
# /solana-authorized_keys contains the public keys for users that should
# automatically be granted access to ALL testnets.
#
# To add an entry into this list:
# 1. Run: ssh-keygen -t ecdsa -N '' -f ~/.ssh/id-solana-testnet
# 2. Inline ~/.ssh/id-solana-testnet.pub below
cat > /solana-authorized_keys <<EOF
ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFBNwLw0i+rI312gWshojFlNw9NV7WfaKeeUsYADqOvM2o4yrO2pPw+sgW8W+/rPpVyH7zU9WVRgTME8NgFV1Vc=
EOF
sudo -u solana bash -c "
cat /solana-authorized_keys >> /home/solana/.ssh/authorized_keys
"

242
net/scripts/ec2-provider.sh Normal file
View File

@ -0,0 +1,242 @@
# |source| this file
#
# Utilities for working with EC2 instances
#
zone=
region=
cloud_SetZone() {
zone="$1"
# AWS region is zone with the last character removed
region="${zone:0:$((${#zone} - 1))}"
}
# Set the default zone
cloud_SetZone "us-east-1b"
# sshPrivateKey should be globally defined whenever this function is called.
#
# TODO: Remove usage of the sshPrivateKey global
__cloud_SshPrivateKeyCheck() {
# shellcheck disable=SC2154
if [[ -z $sshPrivateKey ]]; then
echo Error: sshPrivateKey not defined
exit 1
fi
if [[ ! -r $sshPrivateKey ]]; then
echo "Error: file is not readable: $sshPrivateKey"
exit 1
fi
}
#
# __cloud_FindInstances
#
# Find instances with name matching the specified pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# filter - The instances to filter on
#
# examples:
# $ __cloud_FindInstances "exact-machine-name"
# $ __cloud_FindInstances "all-machines-with-a-common-machine-prefix*"
#
__cloud_FindInstances() {
declare filter="$1"
instances=()
declare name publicIp privateIp
while read -r name publicIp privateIp; do
printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"
instances+=("$name:$publicIp:$privateIp")
done < <(aws ec2 describe-instances \
--region "$region" \
--filters \
"Name=tag:name,Values=$filter" \
"Name=instance-state-name,Values=pending,running" \
--query "Reservations[].Instances[].[InstanceId,PublicIpAddress,PrivateIpAddress]" \
--output text
)
}
#
# cloud_FindInstances [namePrefix]
#
# Find instances with names matching the specified prefix
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# namePrefix - The instance name prefix to look for
#
# examples:
# $ cloud_FindInstances all-machines-with-a-common-machine-prefix
#
cloud_FindInstances() {
declare namePrefix="$1"
__cloud_FindInstances "$namePrefix*"
}
#
# cloud_FindInstance [name]
#
# Find an instance with a name matching the exact pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# name - The instance name to look for
#
# examples:
# $ cloud_FindInstance exact-machine-name
#
cloud_FindInstance() {
declare name="$1"
__cloud_FindInstances "$name"
}
#
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
# [machineType] [bootDiskSize] [enableGpu]
# [startupScript] [address]
#
# Creates one more identical instances.
#
# networkName - unique name of this testnet
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# imageName - Disk image for the instances
# machineType - GCE machine type
# bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName
# has been provisioned in the GCE region that is hosting |zone|
#
# Tip: use cloud_FindInstances to locate the instances once this function
# returns
cloud_CreateInstances() {
declare networkName="$1"
declare namePrefix="$2"
declare numNodes="$3"
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalGpu="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
__cloud_SshPrivateKeyCheck
(
set -x
aws ec2 delete-key-pair --region "$region" --key-name "$networkName"
aws ec2 import-key-pair --region "$region" --key-name "$networkName" \
--public-key-material file://"${sshPrivateKey}".pub
)
declare -a args
args=(
--key-name "$networkName"
--count "$numNodes"
--region "$region"
--placement "AvailabilityZone=$zone"
--security-groups testnet
--image-id "$imageName"
--instance-type "$machineType"
--tag-specifications "ResourceType=instance,Tags=[{Key=name,Value=$namePrefix}]"
)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
--block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
)
fi
if [[ $optionalGpu = true ]]; then
echo TODO: GPU support not implemented yet
exit 1
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--user-data "file://$optionalStartupScript"
)
fi
if [[ -n $optionalAddress ]]; then
[[ $numNodes = 1 ]] || {
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
exit 1
}
fi
(
set -x
aws ec2 run-instances "${args[@]}"
)
if [[ -n $optionalAddress ]]; then
cloud_FindInstance "$namePrefix"
if [[ ${#instances[@]} -ne 1 ]]; then
echo "Failed to find newly created instance: $namePrefix"
fi
declare instanceId
IFS=: read -r instanceId _ < <(echo "${instances[0]}")
aws ec2 associate-address \
--instance-id "$instanceId" \
--region "region" \
--allocation-id "$optionalAddress"
fi
}
#
# cloud_DeleteInstances
#
# Deletes all the instances listed in the `instances` array
#
cloud_DeleteInstances() {
if [[ ${#instances[0]} -eq 0 ]]; then
echo No instances to delete
return
fi
declare names=("${instances[@]/:*/}")
(
set -x
aws ec2 terminate-instances --region "$region" --instance-ids "${names[@]}"
)
}
#
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
#
# Fetch a file from the given instance. This function uses a cloud-specific
# mechanism to fetch the file
#
cloud_FetchFile() {
# shellcheck disable=SC2034 # instanceName is unused
declare instanceName="$1"
declare publicIp="$2"
declare remoteFile="$3"
declare localFile="$4"
__cloud_SshPrivateKeyCheck
(
set -x
scp \
-o "StrictHostKeyChecking=no" \
-o "UserKnownHostsFile=/dev/null" \
-o "User=solana" \
-o "IdentityFile=$sshPrivateKey" \
-o "LogLevel=ERROR" \
-F /dev/null \
"solana@$publicIp:$remoteFile" "$localFile"
)
}

201
net/scripts/gce-provider.sh Normal file
View File

@ -0,0 +1,201 @@
# |source| this file
#
# Utilities for working with GCE instances
#
# Default zone
zone="us-west1-b"
cloud_SetZone() {
zone="$1"
}
#
# __cloud_FindInstances
#
# Find instances matching the specified pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:zone:public IP:private IP"
#
# filter - The instances to filter on
#
# examples:
# $ __cloud_FindInstances "name=exact-machine-name"
# $ __cloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
#
__cloud_FindInstances() {
declare filter="$1"
instances=()
declare name zone publicIp privateIp status
while read -r name publicIp privateIp status; do
if [[ $status != RUNNING ]]; then
echo "Warning: $name is not RUNNING, ignoring it."
continue
fi
printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"
instances+=("$name:$publicIp:$privateIp")
done < <(gcloud compute instances list \
--filter="$filter" \
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
#
# cloud_FindInstances [namePrefix]
#
# Find instances with names matching the specified prefix
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# namePrefix - The instance name prefix to look for
#
# examples:
# $ cloud_FindInstances all-machines-with-a-common-machine-prefix
#
cloud_FindInstances() {
declare namePrefix="$1"
__cloud_FindInstances "name~^$namePrefix"
}
#
# cloud_FindInstance [name]
#
# Find an instance with a name matching the exact pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:public IP:private IP"
#
# name - The instance name to look for
#
# examples:
# $ cloud_FindInstance exact-machine-name
#
cloud_FindInstance() {
declare name="$1"
__cloud_FindInstances "name=$name"
}
#
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
# [machineType] [bootDiskSize] [enableGpu]
# [startupScript] [address]
#
# Creates one more identical instances.
#
# networkName - unique name of this testnet
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# imageName - Disk image for the instances
# machineType - GCE machine type
# bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName
# has been provisioned in the GCE region that is hosting `$zone`
#
# Tip: use cloud_FindInstances to locate the instances once this function
# returns
cloud_CreateInstances() {
declare networkName="$1"
declare namePrefix="$2"
declare numNodes="$3"
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalGpu="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
declare nodes
if [[ $numNodes = 1 ]]; then
nodes=("$namePrefix")
else
read -ra nodes <<<$(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
fi
declare -a args
args=(
"--zone=$zone"
"--tags=testnet"
"--metadata=testnet=$networkName"
"--image=$imageName"
"--machine-type=$machineType"
)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
"--boot-disk-size=${optionalBootDiskSize}GB"
)
fi
if [[ $optionalGpu = true ]]; then
args+=(
"--accelerator=count=4,type=nvidia-tesla-k80"
--maintenance-policy TERMINATE
--restart-on-failure
)
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--metadata-from-file "startup-script=$optionalStartupScript"
)
fi
if [[ -n $optionalAddress ]]; then
[[ $numNodes = 1 ]] || {
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
exit 1
}
args+=(
"--address=$optionalAddress"
)
fi
(
set -x
gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
)
}
#
# cloud_DeleteInstances
#
# Deletes all the instances listed in the `instances` array
#
cloud_DeleteInstances() {
if [[ ${#instances[0]} -eq 0 ]]; then
echo No instances to delete
return
fi
declare names=("${instances[@]/:*/}")
(
set -x
gcloud beta compute instances delete --zone "$zone" --quiet "${names[@]}"
)
}
#
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
#
# Fetch a file from the given instance. This function uses a cloud-specific
# mechanism to fetch the file
#
cloud_FetchFile() {
declare instanceName="$1"
# shellcheck disable=SC2034 # publicIp is unused
declare publicIp="$2"
declare remoteFile="$3"
declare localFile="$4"
(
set -x
gcloud compute scp --zone "$zone" "$instanceName:$remoteFile" "$localFile"
)
}

View File

@ -1,187 +0,0 @@
# |source| this file
#
# Utilities for working with gcloud
#
#
# gcloud_FindInstances [filter] [options]
#
# Find instances matching the specified pattern.
#
# For each matching instance, an entry in the `instances` array will be added with the
# following information about the instance:
# "name:zone:public IP:private IP"
#
# filter - The instances to filter on
# options - If set to the string "show", the list of instances will be echoed
# to stdout
#
# examples:
# $ gcloud_FindInstances "name=exact-machine-name"
# $ gcloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
#
gcloud_FindInstances() {
declare filter="$1"
declare options="$2"
instances=()
declare name zone publicIp privateIp status
while read -r name zone publicIp privateIp status; do
if [[ $status != RUNNING ]]; then
echo "Warning: $name is not RUNNING, ignoring it."
continue
fi
if [[ $options = show ]]; then
printf "%-30s | %-16s publicIp=%-16s privateIp=%s\n" "$name" "$zone" "$publicIp" "$privateIp"
fi
instances+=("$name:$zone:$publicIp:$privateIp")
done < <(gcloud compute instances list \
--filter="$filter" \
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
}
#
# gcloud_ForEachInstance [cmd] [extra args to cmd]
#
# Execute a command for each element in the `instances` array
#
# cmd - The command to execute on each instance
# The command will receive arguments followed by any
# additionl arguments supplied to gcloud_ForEachInstance:
# name - name of the instance
# zone - zone the instance is located in
# publicIp - The public IP address of this instance
# privateIp - The priate IP address of this instance
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
# ... - Extra args to cmd..
#
#
gcloud_ForEachInstance() {
declare cmd="$1"
shift
[[ -n $cmd ]] || { echo gcloud_ForEachInstance: cmd not specified; exit 1; }
declare count=1
for info in "${instances[@]}"; do
declare name zone publicIp privateIp
IFS=: read -r name zone publicIp privateIp < <(echo "$info")
eval "$cmd" "$name" "$zone" "$publicIp" "$privateIp" "$count" "$@"
count=$((count + 1))
done
}
#
# gcloud_CreateInstances [namePrefix] [numNodes] [zone] [imageName]
# [machineType] [bootDiskSize] [accelerator]
# [startupScript] [address]
#
# Creates one more identical instances.
#
# namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create
# zone - zone to create the instances in
# imageName - Disk image for the instances
# machineType - GCE machine type
# bootDiskSize - Optional disk of the boot disk
# accelerator - Optional accelerator to attach to the instance(s), see
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots
# address - Optional name of the GCE static IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName
# has been provisioned in the GCE region that is hosting |zone|
#
# Tip: use gcloud_FindInstances to locate the instances once this function
# returns
gcloud_CreateInstances() {
declare namePrefix="$1"
declare numNodes="$2"
declare zone="$3"
declare imageName="$4"
declare machineType="$5"
declare optionalBootDiskSize="$6"
declare optionalAccelerator="$7"
declare optionalStartupScript="$8"
declare optionalAddress="$9"
declare nodes
if [[ $numNodes = 1 ]]; then
nodes=("$namePrefix")
else
read -ra nodes <<<$(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
fi
declare -a args
args=(
"--zone=$zone"
"--tags=testnet"
"--image=$imageName"
"--machine-type=$machineType"
)
if [[ -n $optionalBootDiskSize ]]; then
args+=(
"--boot-disk-size=$optionalBootDiskSize"
)
fi
if [[ -n $optionalAccelerator ]]; then
args+=(
"--accelerator=$optionalAccelerator"
--maintenance-policy TERMINATE
--restart-on-failure
)
fi
if [[ -n $optionalStartupScript ]]; then
args+=(
--metadata-from-file "startup-script=$optionalStartupScript"
)
fi
if [[ -n $optionalAddress ]]; then
[[ $numNodes = 1 ]] || {
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
exit 1
}
args+=(
"--address=$optionalAddress"
)
fi
(
set -x
gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
)
}
#
# gcloud_DeleteInstances [yes]
#
# Deletes all the instances listed in the `instances` array
#
# If yes = "true", skip the delete confirmation
#
gcloud_DeleteInstances() {
declare maybeQuiet=
if [[ $1 = true ]]; then
maybeQuiet=--quiet
fi
if [[ ${#instances[0]} -eq 0 ]]; then
echo No instances to delete
return
fi
declare names=("${instances[@]/:*/}")
# Assume all instances are in the same zone
# TODO: One day this assumption will be invalid
declare zone
IFS=: read -r _ zone _ < <(echo "${instances[0]}")
(
set -x
gcloud beta compute instances delete --zone "$zone" $maybeQuiet "${names[@]}"
)
}