Add GPU support to ec2-provider

This commit is contained in:
Michael Vines
2018-09-17 08:25:10 -07:00
parent f89f121d2b
commit 155ee8792f
3 changed files with 42 additions and 41 deletions

View File

@ -11,7 +11,9 @@ gce)
source "$here"/scripts/gce-provider.sh source "$here"/scripts/gce-provider.sh
imageName="ubuntu-16-04-cuda-9-2-new" imageName="ubuntu-16-04-cuda-9-2-new"
leaderMachineType=n1-standard-16 cpuLeaderMachineType=n1-standard-16
gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80"
leaderMachineType=$cpuLeaderMachineType
validatorMachineType=n1-standard-4 validatorMachineType=n1-standard-4
clientMachineType=n1-standard-16 clientMachineType=n1-standard-16
;; ;;
@ -19,8 +21,10 @@ ec2)
# shellcheck source=net/scripts/ec2-provider.sh # shellcheck source=net/scripts/ec2-provider.sh
source "$here"/scripts/ec2-provider.sh source "$here"/scripts/ec2-provider.sh
imageName="ami-04169656fea786776" imageName="ami-0466e26ccc0e752c1"
leaderMachineType=m4.4xlarge cpuLeaderMachineType=m4.4xlarge
gpuLeaderMachineType=p2.xlarge
leaderMachineType=$cpuLeaderMachineType
validatorMachineType=m4.xlarge validatorMachineType=m4.xlarge
clientMachineType=m4.4xlarge clientMachineType=m4.4xlarge
;; ;;
@ -35,7 +39,7 @@ validatorNodeCount=5
clientNodeCount=1 clientNodeCount=1
leaderBootDiskSizeInGb=1000 leaderBootDiskSizeInGb=1000
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
clientBootDiskSizeInGb=40 clientBootDiskSizeInGb=75
publicNetwork=false publicNetwork=false
enableGpu=false enableGpu=false
@ -111,6 +115,7 @@ while getopts "h?p:Pn:c:z:ga:" opt; do
;; ;;
g) g)
enableGpu=true enableGpu=true
leaderMachineType="$gpuLeaderMachineType"
;; ;;
a) a)
leaderAddress=$OPTARG leaderAddress=$OPTARG
@ -372,16 +377,16 @@ touch /.instance-startup-complete
EOF EOF
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \ cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \ "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \
"$startupScript" "$leaderAddress" "$startupScript" "$leaderAddress"
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \ cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \ "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \
"$startupScript" "" "$startupScript" ""
if [[ $clientNodeCount -gt 0 ]]; then if [[ $clientNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \ cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \ "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \
"$startupScript" "" "$startupScript" ""
fi fi

View File

@ -104,8 +104,7 @@ cloud_FindInstance() {
# #
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName] # cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
# [machineType] [bootDiskSize] [enableGpu] # [machineType] [bootDiskSize] [startupScript] [address]
# [startupScript] [address]
# #
# Creates one or more identical instances. # Creates one or more identical instances.
# #
@ -115,8 +114,6 @@ cloud_FindInstance() {
# imageName - Disk image for the instances # imageName - Disk image for the instances
# machineType - EC2 instance type # machineType - EC2 instance type
# bootDiskSize - Optional size of the boot disk in GB # bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
# startupScript - Optional startup script to execute when the instance boots # startupScript - Optional startup script to execute when the instance boots
# address - Optional allocation ID of the EC2 Elastic IP address to attach to the # address - Optional allocation ID of the EC2 Elastic IP address to attach to the
# instance. Requires that |numNodes| = 1 and that addressName # instance. Requires that |numNodes| = 1 and that addressName
@ -131,9 +128,8 @@ cloud_CreateInstances() {
declare imageName="$4" declare imageName="$4"
declare machineType="$5" declare machineType="$5"
declare optionalBootDiskSize="$6" declare optionalBootDiskSize="$6"
declare optionalGpu="$7" declare optionalStartupScript="$7"
declare optionalStartupScript="$8" declare optionalAddress="$8"
declare optionalAddress="$9"
__cloud_SshPrivateKeyCheck __cloud_SshPrivateKeyCheck
( (
@ -159,10 +155,6 @@ cloud_CreateInstances() {
--block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]" --block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
) )
fi fi
if [[ $optionalGpu = true ]]; then
echo TODO: GPU support not implemented yet
exit 1
fi
if [[ -n $optionalStartupScript ]]; then if [[ -n $optionalStartupScript ]]; then
args+=( args+=(
--user-data "file://$optionalStartupScript" --user-data "file://$optionalStartupScript"
@ -189,10 +181,16 @@ cloud_CreateInstances() {
declare instanceId declare instanceId
IFS=: read -r instanceId _ < <(echo "${instances[0]}") IFS=: read -r instanceId _ < <(echo "${instances[0]}")
aws ec2 associate-address \ (
--instance-id "$instanceId" \ set -x
--region "region" \ # TODO: Poll that the instance has moved to the 'running' state instead of
--allocation-id "$optionalAddress" # blindly sleeping for 30 seconds...
sleep 30
aws ec2 associate-address \
--instance-id "$instanceId" \
--region "$region" \
--allocation-id "$optionalAddress"
)
fi fi
} }

View File

@ -39,7 +39,7 @@ __cloud_FindInstances() {
instances+=("$name:$publicIp:$privateIp") instances+=("$name:$publicIp:$privateIp")
done < <(gcloud compute instances list \ done < <(gcloud compute instances list \
--filter="$filter" \ --filter "$filter" \
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)') --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
} }
# #
@ -91,7 +91,9 @@ cloud_FindInstance() {
# namePrefix - unique string to prefix all the instance names with # namePrefix - unique string to prefix all the instance names with
# numNodes - number of instances to create # numNodes - number of instances to create
# imageName - Disk image for the instances # imageName - Disk image for the instances
# machineType - GCE machine type # machineType - GCE machine type. Note that this may also include an
# `--accelerator=` or other |gcloud compute instances create|
# options
# bootDiskSize - Optional size of the boot disk in GB # bootDiskSize - Optional size of the boot disk in GB
# enableGpu - Optionally enable GPU, use the value "true" to enable # enableGpu - Optionally enable GPU, use the value "true" to enable
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80" # eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
@ -109,9 +111,8 @@ cloud_CreateInstances() {
declare imageName="$4" declare imageName="$4"
declare machineType="$5" declare machineType="$5"
declare optionalBootDiskSize="$6" declare optionalBootDiskSize="$6"
declare optionalGpu="$7" declare optionalStartupScript="$7"
declare optionalStartupScript="$8" declare optionalAddress="$8"
declare optionalAddress="$9"
declare nodes declare nodes
if [[ $numNodes = 1 ]]; then if [[ $numNodes = 1 ]]; then
@ -122,22 +123,19 @@ cloud_CreateInstances() {
declare -a args declare -a args
args=( args=(
"--zone=$zone" --zone "$zone"
"--tags=testnet" --tags testnet
"--metadata=testnet=$networkName" --metadata "testnet=$networkName"
"--image=$imageName" --image "$imageName"
"--machine-type=$machineType" --maintenance-policy TERMINATE
--no-restart-on-failure
) )
# shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
args+=(--machine-type $machineType)
if [[ -n $optionalBootDiskSize ]]; then if [[ -n $optionalBootDiskSize ]]; then
args+=( args+=(
"--boot-disk-size=${optionalBootDiskSize}GB" --boot-disk-size "${optionalBootDiskSize}GB"
)
fi
if [[ $optionalGpu = true ]]; then
args+=(
"--accelerator=count=4,type=nvidia-tesla-k80"
--maintenance-policy TERMINATE
--restart-on-failure
) )
fi fi
if [[ -n $optionalStartupScript ]]; then if [[ -n $optionalStartupScript ]]; then
@ -152,7 +150,7 @@ cloud_CreateInstances() {
exit 1 exit 1
} }
args+=( args+=(
"--address=$optionalAddress" --address "$optionalAddress"
) )
fi fi