Add GPU support to ec2-provider
This commit is contained in:
19
net/gce.sh
19
net/gce.sh
@ -11,7 +11,9 @@ gce)
|
|||||||
source "$here"/scripts/gce-provider.sh
|
source "$here"/scripts/gce-provider.sh
|
||||||
|
|
||||||
imageName="ubuntu-16-04-cuda-9-2-new"
|
imageName="ubuntu-16-04-cuda-9-2-new"
|
||||||
leaderMachineType=n1-standard-16
|
cpuLeaderMachineType=n1-standard-16
|
||||||
|
gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80"
|
||||||
|
leaderMachineType=$cpuLeaderMachineType
|
||||||
validatorMachineType=n1-standard-4
|
validatorMachineType=n1-standard-4
|
||||||
clientMachineType=n1-standard-16
|
clientMachineType=n1-standard-16
|
||||||
;;
|
;;
|
||||||
@ -19,8 +21,10 @@ ec2)
|
|||||||
# shellcheck source=net/scripts/ec2-provider.sh
|
# shellcheck source=net/scripts/ec2-provider.sh
|
||||||
source "$here"/scripts/ec2-provider.sh
|
source "$here"/scripts/ec2-provider.sh
|
||||||
|
|
||||||
imageName="ami-04169656fea786776"
|
imageName="ami-0466e26ccc0e752c1"
|
||||||
leaderMachineType=m4.4xlarge
|
cpuLeaderMachineType=m4.4xlarge
|
||||||
|
gpuLeaderMachineType=p2.xlarge
|
||||||
|
leaderMachineType=$cpuLeaderMachineType
|
||||||
validatorMachineType=m4.xlarge
|
validatorMachineType=m4.xlarge
|
||||||
clientMachineType=m4.4xlarge
|
clientMachineType=m4.4xlarge
|
||||||
;;
|
;;
|
||||||
@ -35,7 +39,7 @@ validatorNodeCount=5
|
|||||||
clientNodeCount=1
|
clientNodeCount=1
|
||||||
leaderBootDiskSizeInGb=1000
|
leaderBootDiskSizeInGb=1000
|
||||||
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
|
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
|
||||||
clientBootDiskSizeInGb=40
|
clientBootDiskSizeInGb=75
|
||||||
|
|
||||||
publicNetwork=false
|
publicNetwork=false
|
||||||
enableGpu=false
|
enableGpu=false
|
||||||
@ -111,6 +115,7 @@ while getopts "h?p:Pn:c:z:ga:" opt; do
|
|||||||
;;
|
;;
|
||||||
g)
|
g)
|
||||||
enableGpu=true
|
enableGpu=true
|
||||||
|
leaderMachineType="$gpuLeaderMachineType"
|
||||||
;;
|
;;
|
||||||
a)
|
a)
|
||||||
leaderAddress=$OPTARG
|
leaderAddress=$OPTARG
|
||||||
@ -372,16 +377,16 @@ touch /.instance-startup-complete
|
|||||||
EOF
|
EOF
|
||||||
|
|
||||||
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
|
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
|
||||||
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
|
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \
|
||||||
"$startupScript" "$leaderAddress"
|
"$startupScript" "$leaderAddress"
|
||||||
|
|
||||||
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
|
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
|
||||||
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
|
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \
|
||||||
"$startupScript" ""
|
"$startupScript" ""
|
||||||
|
|
||||||
if [[ $clientNodeCount -gt 0 ]]; then
|
if [[ $clientNodeCount -gt 0 ]]; then
|
||||||
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
|
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
|
||||||
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
|
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \
|
||||||
"$startupScript" ""
|
"$startupScript" ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -104,8 +104,7 @@ cloud_FindInstance() {
|
|||||||
|
|
||||||
#
|
#
|
||||||
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
|
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
|
||||||
# [machineType] [bootDiskSize] [enableGpu]
|
# [machineType] [bootDiskSize] [startupScript] [address]
|
||||||
# [startupScript] [address]
|
|
||||||
#
|
#
|
||||||
# Creates one or more identical instances.
|
# Creates one or more identical instances.
|
||||||
#
|
#
|
||||||
@ -115,8 +114,6 @@ cloud_FindInstance() {
|
|||||||
# imageName - Disk image for the instances
|
# imageName - Disk image for the instances
|
||||||
# machineType - EC2 instance type
|
# machineType - EC2 instance type
|
||||||
# bootDiskSize - Optional size of the boot disk in GB
|
# bootDiskSize - Optional size of the boot disk in GB
|
||||||
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
|
||||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
|
||||||
# startupScript - Optional startup script to execute when the instance boots
|
# startupScript - Optional startup script to execute when the instance boots
|
||||||
# address - Optional allocation ID of the EC2 Elastic IP address to attach to the
|
# address - Optional allocation ID of the EC2 Elastic IP address to attach to the
|
||||||
# instance. Requires that |numNodes| = 1 and that addressName
|
# instance. Requires that |numNodes| = 1 and that addressName
|
||||||
@ -131,9 +128,8 @@ cloud_CreateInstances() {
|
|||||||
declare imageName="$4"
|
declare imageName="$4"
|
||||||
declare machineType="$5"
|
declare machineType="$5"
|
||||||
declare optionalBootDiskSize="$6"
|
declare optionalBootDiskSize="$6"
|
||||||
declare optionalGpu="$7"
|
declare optionalStartupScript="$7"
|
||||||
declare optionalStartupScript="$8"
|
declare optionalAddress="$8"
|
||||||
declare optionalAddress="$9"
|
|
||||||
|
|
||||||
__cloud_SshPrivateKeyCheck
|
__cloud_SshPrivateKeyCheck
|
||||||
(
|
(
|
||||||
@ -159,10 +155,6 @@ cloud_CreateInstances() {
|
|||||||
--block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
|
--block-device-mapping "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
|
||||||
)
|
)
|
||||||
fi
|
fi
|
||||||
if [[ $optionalGpu = true ]]; then
|
|
||||||
echo TODO: GPU support not implemented yet
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [[ -n $optionalStartupScript ]]; then
|
if [[ -n $optionalStartupScript ]]; then
|
||||||
args+=(
|
args+=(
|
||||||
--user-data "file://$optionalStartupScript"
|
--user-data "file://$optionalStartupScript"
|
||||||
@ -189,10 +181,16 @@ cloud_CreateInstances() {
|
|||||||
|
|
||||||
declare instanceId
|
declare instanceId
|
||||||
IFS=: read -r instanceId _ < <(echo "${instances[0]}")
|
IFS=: read -r instanceId _ < <(echo "${instances[0]}")
|
||||||
|
(
|
||||||
|
set -x
|
||||||
|
# TODO: Poll that the instance has moved to the 'running' state instead of
|
||||||
|
# blindly sleeping for 30 seconds...
|
||||||
|
sleep 30
|
||||||
aws ec2 associate-address \
|
aws ec2 associate-address \
|
||||||
--instance-id "$instanceId" \
|
--instance-id "$instanceId" \
|
||||||
--region "region" \
|
--region "$region" \
|
||||||
--allocation-id "$optionalAddress"
|
--allocation-id "$optionalAddress"
|
||||||
|
)
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ __cloud_FindInstances() {
|
|||||||
|
|
||||||
instances+=("$name:$publicIp:$privateIp")
|
instances+=("$name:$publicIp:$privateIp")
|
||||||
done < <(gcloud compute instances list \
|
done < <(gcloud compute instances list \
|
||||||
--filter="$filter" \
|
--filter "$filter" \
|
||||||
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
|
--format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
|
||||||
}
|
}
|
||||||
#
|
#
|
||||||
@ -91,7 +91,9 @@ cloud_FindInstance() {
|
|||||||
# namePrefix - unique string to prefix all the instance names with
|
# namePrefix - unique string to prefix all the instance names with
|
||||||
# numNodes - number of instances to create
|
# numNodes - number of instances to create
|
||||||
# imageName - Disk image for the instances
|
# imageName - Disk image for the instances
|
||||||
# machineType - GCE machine type
|
# machineType - GCE machine type. Note that this may also include an
|
||||||
|
# `--accelerator=` or other |gcloud compute instances create|
|
||||||
|
# options
|
||||||
# bootDiskSize - Optional size of the boot disk in GB
|
# bootDiskSize - Optional size of the boot disk in GB
|
||||||
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
# enableGpu - Optionally enable GPU, use the value "true" to enable
|
||||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||||
@ -109,9 +111,8 @@ cloud_CreateInstances() {
|
|||||||
declare imageName="$4"
|
declare imageName="$4"
|
||||||
declare machineType="$5"
|
declare machineType="$5"
|
||||||
declare optionalBootDiskSize="$6"
|
declare optionalBootDiskSize="$6"
|
||||||
declare optionalGpu="$7"
|
declare optionalStartupScript="$7"
|
||||||
declare optionalStartupScript="$8"
|
declare optionalAddress="$8"
|
||||||
declare optionalAddress="$9"
|
|
||||||
|
|
||||||
declare nodes
|
declare nodes
|
||||||
if [[ $numNodes = 1 ]]; then
|
if [[ $numNodes = 1 ]]; then
|
||||||
@ -122,22 +123,19 @@ cloud_CreateInstances() {
|
|||||||
|
|
||||||
declare -a args
|
declare -a args
|
||||||
args=(
|
args=(
|
||||||
"--zone=$zone"
|
--zone "$zone"
|
||||||
"--tags=testnet"
|
--tags testnet
|
||||||
"--metadata=testnet=$networkName"
|
--metadata "testnet=$networkName"
|
||||||
"--image=$imageName"
|
--image "$imageName"
|
||||||
"--machine-type=$machineType"
|
--maintenance-policy TERMINATE
|
||||||
|
--no-restart-on-failure
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# shellcheck disable=SC2206 # Do not want to quote $machineType as it may contain extra args
|
||||||
|
args+=(--machine-type $machineType)
|
||||||
if [[ -n $optionalBootDiskSize ]]; then
|
if [[ -n $optionalBootDiskSize ]]; then
|
||||||
args+=(
|
args+=(
|
||||||
"--boot-disk-size=${optionalBootDiskSize}GB"
|
--boot-disk-size "${optionalBootDiskSize}GB"
|
||||||
)
|
|
||||||
fi
|
|
||||||
if [[ $optionalGpu = true ]]; then
|
|
||||||
args+=(
|
|
||||||
"--accelerator=count=4,type=nvidia-tesla-k80"
|
|
||||||
--maintenance-policy TERMINATE
|
|
||||||
--restart-on-failure
|
|
||||||
)
|
)
|
||||||
fi
|
fi
|
||||||
if [[ -n $optionalStartupScript ]]; then
|
if [[ -n $optionalStartupScript ]]; then
|
||||||
@ -152,7 +150,7 @@ cloud_CreateInstances() {
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
args+=(
|
args+=(
|
||||||
"--address=$optionalAddress"
|
--address "$optionalAddress"
|
||||||
)
|
)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user