diff --git a/net/gce.sh b/net/gce.sh index f8a417869a..db59bde3c5 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -47,6 +47,14 @@ publicNetwork=false enableGpu=false customAddress= leaderRotation=true +zones=() + +containsZone() { + local e match="$1" + shift + for e; do [[ "$e" == "$match" ]] && return 0; done + return 1 +} usage() { exitcode=0 @@ -125,7 +133,7 @@ while getopts "h?p:Pn:c:z:gG:a:d:bu" opt; do clientNodeCount=$OPTARG ;; z) - cloud_SetZone "$OPTARG" + containsZone "$OPTARG" "${zones[@]}" || zones+=("$OPTARG") ;; b) leaderRotation=false @@ -156,6 +164,8 @@ while getopts "h?p:Pn:c:z:gG:a:d:bu" opt; do done shift $((OPTIND - 1)) +[[ ${#zones[@]} -gt 0 ]] || zones+=($(cloud_DefaultZone)) + [[ -z $1 ]] || usage "Unexpected argument: $1" if [[ $cloudProvider = ec2 ]]; then # EC2 keys can't be retrieved from running instances like GCE keys can so save @@ -168,59 +178,8 @@ fi case $cloudProvider in gce) - if $enableGpu; then - # Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed - # - # TODO: Unfortunately this image is not public. When this becomes an issue, - # use the stock Ubuntu 18.04 image and programmatically install CUDA after the - # instance boots - # - imageName="ubuntu-1804-bionic-v20181029-with-cuda-10-and-cuda-9-2" - else - # Upstream Ubuntu 18.04 LTS image - imageName="ubuntu-1804-bionic-v20181029 --image-project ubuntu-os-cloud" - fi ;; ec2) - if $enableGpu; then - # - # Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed - # - # TODO: Unfortunately these AMIs are not public. When this becomes an issue, - # use the stock Ubuntu 18.04 image and programmatically install CUDA after the - # instance boots - # - case $region in - us-east-1) - imageName="ami-0a8bd6fb204473f78" - ;; - us-west-1) - imageName="ami-07011f0795513c59d" - ;; - us-west-2) - imageName="ami-0a11ef42b62b82b68" - ;; - *) - usage "Unsupported region: $region" - ;; - esac - else - # Select an upstream Ubuntu 18.04 AMI from https://cloud-images.ubuntu.com/locator/ec2/ - case $region in - us-east-1) - imageName="ami-0a313d6098716f372" - ;; - us-west-1) - imageName="ami-06397100adf427136" - ;; - us-west-2) - imageName="ami-0dc34f4b016c9ce49" - ;; - *) - usage "Unsupported region: $region" - ;; - esac - fi ;; *) echo "Error: Unknown cloud provider: $cloudProvider" @@ -313,7 +272,8 @@ EOF ( declare nodeName declare nodeIp - IFS=: read -r nodeName nodeIp _ < <(echo "${instances[0]}") + declare nodeZone + IFS=: read -r nodeName nodeIp _ nodeZone < <(echo "${instances[0]}") # Try to ping the machine first. timeout 90s bash -c "set -o pipefail; until ping -c 3 $nodeIp | tr - _; do echo .; done" @@ -325,7 +285,7 @@ EOF # machine can be pinged... set -x -o pipefail for i in $(seq 1 30); do - if cloud_FetchFile "$nodeName" "$nodeIp" /solana-id_ecdsa "$sshPrivateKey"; then + if cloud_FetchFile "$nodeName" "$nodeIp" /solana-id_ecdsa "$sshPrivateKey" "$nodeZone"; then break fi @@ -344,13 +304,15 @@ EOF cloud_ForEachInstance waitForStartupComplete echo "Looking for additional fullnode instances..." - cloud_FindInstances "$prefix-fullnode" - [[ ${#instances[@]} -gt 0 ]] || { - echo "Unable to find additional fullnodes" - exit 1 - } - cloud_ForEachInstance recordInstanceIp fullnodeIpList - cloud_ForEachInstance waitForStartupComplete + for zone in "${zones[@]}"; do + cloud_FindInstances "$prefix-$zone-fullnode" + [[ ${#instances[@]} -gt 0 ]] || { + echo "Unable to find additional fullnodes" + exit 1 + } + cloud_ForEachInstance recordInstanceIp fullnodeIpList + cloud_ForEachInstance waitForStartupComplete + done echo "clientIpList=()" >> "$configFile" echo "clientIpListPrivate=()" >> "$configFile" @@ -381,7 +343,14 @@ delete() { # during shutdown (only applicable when leader rotation is disabled). # TODO: It would be better to fully cut-off metrics reporting before any # instances are deleted. - for filter in "$prefix-bootstrap-leader" "$prefix-"; do + filters=("$prefix-bootstrap-leader") + for zone in "${zones[@]}"; do + filters+=("$prefix-$zone") + done + # Filter for all other nodes (client, blockstreamer) + filters+=("$prefix-") + + for filter in "${filters[@]}"; do echo "Searching for instances: $filter" cloud_FindInstances "$filter" @@ -501,25 +470,37 @@ EOF bootstrapLeaderAddress=$customAddress fi - cloud_Initialize "$prefix" + for zone in "${zones[@]}"; do + cloud_Initialize "$prefix" "$zone" + done cloud_CreateInstances "$prefix" "$prefix-bootstrap-leader" 1 \ - "$imageName" "$bootstrapLeaderMachineType" "$fullNodeBootDiskSizeInGb" \ + "$enableGpu" "$bootstrapLeaderMachineType" "${zones[0]}" "$fullNodeBootDiskSizeInGb" \ "$startupScript" "$bootstrapLeaderAddress" "$bootDiskType" - cloud_CreateInstances "$prefix" "$prefix-fullnode" "$additionalFullNodeCount" \ - "$imageName" "$fullNodeMachineType" "$fullNodeBootDiskSizeInGb" \ - "$startupScript" "" "$bootDiskType" + num_zones=${#zones[@]} + numNodesPerZone=$((additionalFullNodeCount / num_zones)) + numLeftOverNodes=$((additionalFullNodeCount % num_zones)) + count=0 + for zone in "${zones[@]}"; do + count=$((count + 1)) + if [[ $count -eq $num_zones ]]; then + numNodesPerZone=$((numNodesPerZone + numLeftOverNodes)) + fi + cloud_CreateInstances "$prefix" "$prefix-$zone-fullnode" "$numNodesPerZone" \ + "$enableGpu" "$fullNodeMachineType" "$zone" "$fullNodeBootDiskSizeInGb" \ + "$startupScript" "" "$bootDiskType" + done if [[ $clientNodeCount -gt 0 ]]; then cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \ - "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \ + "$enableGpu" "$clientMachineType" "${zones[0]}" "$clientBootDiskSizeInGb" \ "$startupScript" "" "$bootDiskType" fi if $blockstreamer; then cloud_CreateInstances "$prefix" "$prefix-blockstreamer" "1" \ - "$imageName" "$blockstreamerMachineType" "$fullNodeBootDiskSizeInGb" \ + "$enableGpu" "$blockstreamerMachineType" "${zones[0]}" "$fullNodeBootDiskSizeInGb" \ "$startupScript" "$blockstreamerAddress" "$bootDiskType" fi diff --git a/net/scripts/ec2-provider.sh b/net/scripts/ec2-provider.sh index 5787d69db6..aba818a686 100644 --- a/net/scripts/ec2-provider.sh +++ b/net/scripts/ec2-provider.sh @@ -3,17 +3,17 @@ # Utilities for working with EC2 instances # -zone= -region= - -cloud_SetZone() { - zone="$1" - # AWS region is zone with the last character removed - region="${zone:0:$((${#zone} - 1))}" +cloud_DefaultZone() { + echo "us-east-1b" } -# Set the default zone -cloud_SetZone "us-east-1b" +# AWS region is zone with the last character removed +__cloud_GetRegion() { + declare zone="$1" + # AWS region is zone with the last character removed + declare region="${zone:0:$((${#zone} - 1))}" + echo "$region" +} # sshPrivateKey should be globally defined whenever this function is called. # @@ -49,18 +49,22 @@ __cloud_FindInstances() { declare filter="$1" instances=() - declare name publicIp privateIp - while read -r name publicIp privateIp; do - printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp" - instances+=("$name:$publicIp:$privateIp") - done < <(aws ec2 describe-instances \ - --region "$region" \ - --filters \ - "Name=tag:name,Values=$filter" \ - "Name=instance-state-name,Values=pending,running" \ - --query "Reservations[].Instances[].[InstanceId,PublicIpAddress,PrivateIpAddress]" \ - --output text \ - ) + declare -a regions=("us-east-1" "us-west-1" "us-west-2") + for region in "${regions[@]}" + do + declare name publicIp privateIp + while read -r name publicIp privateIp zone; do + printf "%-30s | publicIp=%-16s privateIp=%s zone=%s\n" "$name" "$publicIp" "$privateIp" "$zone" + instances+=("$name:$publicIp:$privateIp:$zone") + done < <(aws ec2 describe-instances \ + --region "$region" \ + --filters \ + "Name=tag:name,Values=$filter" \ + "Name=instance-state-name,Values=pending,running" \ + --query "Reservations[].Instances[].[InstanceId,PublicIpAddress,PrivateIpAddress,Placement.AvailabilityZone]" \ + --output text \ + ) + done } # @@ -111,6 +115,8 @@ cloud_FindInstance() { # This function will be called before |cloud_CreateInstances| cloud_Initialize() { declare networkName="$1" + declare zone="$2" + declare region=$(__cloud_GetRegion "$zone") __cloud_SshPrivateKeyCheck ( @@ -152,11 +158,53 @@ cloud_CreateInstances() { declare networkName="$1" declare namePrefix="$2" declare numNodes="$3" - declare imageName="$4" + declare enableGpu="$4" declare machineType="$5" - declare optionalBootDiskSize="$6" - declare optionalStartupScript="$7" - declare optionalAddress="$8" + declare zone="$6" + declare optionalBootDiskSize="$7" + declare optionalStartupScript="$8" + declare optionalAddress="$9" + declare region=$(__cloud_GetRegion "$zone") + + if $enableGpu; then + # + # Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed + # + # TODO: Unfortunately these AMIs are not public. When this becomes an issue, + # use the stock Ubuntu 18.04 image and programmatically install CUDA after the + # instance boots + # + case $region in + us-east-1) + imageName="ami-0a8bd6fb204473f78" + ;; + us-west-1) + imageName="ami-07011f0795513c59d" + ;; + us-west-2) + imageName="ami-0a11ef42b62b82b68" + ;; + *) + usage "Unsupported region: $region" + ;; + esac + else + # Select an upstream Ubuntu 18.04 AMI from https://cloud-images.ubuntu.com/locator/ec2/ + case $region in + us-east-1) + imageName="ami-0a313d6098716f372" + ;; + us-west-1) + imageName="ami-06397100adf427136" + ;; + us-west-2) + imageName="ami-0dc34f4b016c9ce49" + ;; + *) + usage "Unsupported region: $region" + ;; + esac + fi declare -a args args=( @@ -225,6 +273,8 @@ cloud_DeleteInstances() { fi declare names=("${instances[@]/:*/}") + declare zones=("${instances[@]/*:/}") + declare region=$(__cloud_GetRegion "${zones[0]}") ( set -x diff --git a/net/scripts/gce-provider.sh b/net/scripts/gce-provider.sh index f9202e5ed9..a2ba40e033 100644 --- a/net/scripts/gce-provider.sh +++ b/net/scripts/gce-provider.sh @@ -4,12 +4,10 @@ # # Default zone -zone="us-west1-b" -cloud_SetZone() { - zone="$1" +cloud_DefaultZone() { + echo "us-west1-b" } - # # __cloud_FindInstances # @@ -30,13 +28,13 @@ __cloud_FindInstances() { instances=() declare name zone publicIp privateIp status - while read -r name publicIp privateIp status; do - printf "%-30s | publicIp=%-16s privateIp=%s status=%s\n" "$name" "$publicIp" "$privateIp" "$status" + while read -r name publicIp privateIp status zone; do + printf "%-30s | publicIp=%-16s privateIp=%s status=%s zone=%s\n" "$name" "$publicIp" "$privateIp" "$status" "$zone" - instances+=("$name:$publicIp:$privateIp") + instances+=("$name:$publicIp:$privateIp:$zone") done < <(gcloud compute instances list \ --filter "$filter" \ - --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)') + --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)') } # # cloud_FindInstances [namePrefix] @@ -119,12 +117,26 @@ cloud_CreateInstances() { declare networkName="$1" declare namePrefix="$2" declare numNodes="$3" - declare imageName="$4" + declare enableGpu="$4" declare machineType="$5" - declare optionalBootDiskSize="$6" - declare optionalStartupScript="$7" - declare optionalAddress="$8" - declare optionalBootDiskType="$9" + declare zone="$6" + declare optionalBootDiskSize="$7" + declare optionalStartupScript="$8" + declare optionalAddress="$9" + declare optionalBootDiskType="${10}" + + if $enableGpu; then + # Custom Ubuntu 18.04 LTS image with CUDA 9.2 and CUDA 10.0 installed + # + # TODO: Unfortunately this image is not public. When this becomes an issue, + # use the stock Ubuntu 18.04 image and programmatically install CUDA after the + # instance boots + # + imageName="ubuntu-1804-bionic-v20181029-with-cuda-10-and-cuda-9-2" + else + # Upstream Ubuntu 18.04 LTS image + imageName="ubuntu-1804-bionic-v20181029 --image-project ubuntu-os-cloud" + fi declare -a nodes if [[ $numNodes = 1 ]]; then @@ -192,11 +204,13 @@ cloud_DeleteInstances() { echo No instances to delete return fi + declare names=("${instances[@]/:*/}") + declare zones=("${instances[@]/*:/}") ( set -x - gcloud beta compute instances delete --zone "$zone" --quiet "${names[@]}" + gcloud beta compute instances delete --zone "${zones[0]}" --quiet "${names[@]}" ) } @@ -213,6 +227,7 @@ cloud_FetchFile() { declare publicIp="$2" declare remoteFile="$3" declare localFile="$4" + declare zone="$5" ( set -x