312 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			312 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
# |source| this file
 | 
						|
#
 | 
						|
# Utilities for working with Azure instances
 | 
						|
#
 | 
						|
 | 
						|
# Default zone
 | 
						|
cloud_DefaultZone() {
 | 
						|
  echo "westus"
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# __cloud_GetConfigValueFromInstanceName
 | 
						|
# Return a piece of configuration information about an instance
 | 
						|
# Provide the exact name of an instance and the configuration key, and the corresponding value will be returned
 | 
						|
#
 | 
						|
# example:
 | 
						|
#   This will return the name of the resource group of the instance named
 | 
						|
#   __cloud_GetConfigValueFromInstanceName some-instance-name resourceGroup
 | 
						|
 | 
						|
cloud_GetConfigValueFromInstanceName() {
 | 
						|
  query="[?name=='$1']"
 | 
						|
  key="[$2]"
 | 
						|
  config_value=$(az vm list -d -o tsv --query "$query.$key")
 | 
						|
}
 | 
						|
 | 
						|
cloud_GetResourceGroupFromInstanceName() {
 | 
						|
  resourceGroup=$(az vm list -o tsv --query "[?name=='$1'].[resourceGroup]")
 | 
						|
}
 | 
						|
cloud_GetIdFromInstanceName() {
 | 
						|
  id=$(az vm list -o tsv --query "[?name=='$1'].[id]")
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# __cloud_FindInstances
 | 
						|
#
 | 
						|
# Find instances matching the specified pattern.
 | 
						|
#
 | 
						|
# For each matching instance, an entry in the `instances` array will be added with the
 | 
						|
# following information about the instance:
 | 
						|
#   "name:public IP:private IP:location"
 | 
						|
#
 | 
						|
# filter   - The instances to filter on
 | 
						|
#
 | 
						|
# examples:
 | 
						|
#   $ __cloud_FindInstances prefix some-machine-prefix
 | 
						|
#   $ __cloud_FindInstances name exact-machine-name
 | 
						|
#
 | 
						|
#  Examples of plain-text filter command
 | 
						|
#
 | 
						|
#  This will return an exact match for a machine named pgnode
 | 
						|
#  az vm list -d --query "[?name=='pgnode'].[name,publicIps,privateIps,location]"
 | 
						|
#
 | 
						|
#  This will return a match for any machine with prefix pgnode, ex: pgnode and pgnode2
 | 
						|
#  az vm list -d --query "[?starts_with(name,'pgnode')].[name,publicIps,privateIps,location]"
 | 
						|
__cloud_FindInstances() {
 | 
						|
  case $1 in
 | 
						|
    prefix)
 | 
						|
      query="[?starts_with(name,'$2')]"
 | 
						|
      ;;
 | 
						|
    name)
 | 
						|
      query="[?name=='$2']"
 | 
						|
      ;;
 | 
						|
    *)
 | 
						|
      echo "Unknown filter command: $1"
 | 
						|
      ;;
 | 
						|
  esac
 | 
						|
 | 
						|
  keys="[name,publicIps,privateIps,location]"
 | 
						|
 | 
						|
  instances=()
 | 
						|
  while read -r name publicIp privateIp location; do
 | 
						|
    instances+=("$name:$publicIp:$privateIp:$location")
 | 
						|
  done < <(az vm list -d -o tsv --query "$query.$keys")
 | 
						|
  echo "${instances[*]}"
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_FindInstances [namePrefix]
 | 
						|
#
 | 
						|
# Find instances with names matching the specified prefix
 | 
						|
#
 | 
						|
# For each matching instance, an entry in the `instances` array will be added with the
 | 
						|
# following information about the instance:
 | 
						|
#   "name:public IP:private IP:location"
 | 
						|
#
 | 
						|
# namePrefix - The instance name prefix to look for
 | 
						|
#
 | 
						|
# examples:
 | 
						|
#   $ cloud_FindInstances all-machines-with-a-common-machine-prefix
 | 
						|
#
 | 
						|
cloud_FindInstances() {
 | 
						|
  __cloud_FindInstances prefix "$1"
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_FindInstance [name]
 | 
						|
#
 | 
						|
# Find an instance with a name matching the exact pattern.
 | 
						|
#
 | 
						|
# For each matching instance, an entry in the `instances` array will be added with the
 | 
						|
# following information about the instance:
 | 
						|
#   "name:public IP:private IP:location"
 | 
						|
#
 | 
						|
# name - The instance name to look for
 | 
						|
#
 | 
						|
# examples:
 | 
						|
#   $ cloud_FindInstance exact-machine-name
 | 
						|
#
 | 
						|
cloud_FindInstance() {
 | 
						|
  __cloud_FindInstances name "$1"
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_Initialize [networkName]
 | 
						|
#
 | 
						|
# Perform one-time initialization that may be required for the given testnet.
 | 
						|
#
 | 
						|
# networkName   - unique name of this testnet
 | 
						|
#
 | 
						|
# This function will be called before |cloud_CreateInstances|
 | 
						|
cloud_Initialize() {
 | 
						|
  declare networkName="$1"
 | 
						|
  # ec2-provider.sh creates firewall rules programmatically, should do the same
 | 
						|
  # here.
 | 
						|
  echo "TODO: create $networkName firewall rules programmatically instead of assuming the 'testnet' tag exists"
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
 | 
						|
#                       [machineType] [bootDiskSize] [enableGpu]
 | 
						|
#                       [startupScript] [address]
 | 
						|
#
 | 
						|
# Creates one more identical instances.
 | 
						|
#
 | 
						|
# networkName   - unique name of this testnet
 | 
						|
# namePrefix    - unique string to prefix all the instance names with
 | 
						|
# numNodes      - number of instances to create
 | 
						|
# imageName     - Disk image for the instances
 | 
						|
# machineType   - GCE machine type.  Note that this may also include an
 | 
						|
#                 `--accelerator=` or other |gcloud compute instances create|
 | 
						|
#                 options
 | 
						|
# bootDiskSize  - Optional size of the boot disk in GB
 | 
						|
# enableGpu     - Optionally enable GPU, use the value "true" to enable
 | 
						|
#                 eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
 | 
						|
# startupScript - Optional startup script to execute when the instance boots
 | 
						|
# address       - Optional name of the GCE static IP address to attach to the
 | 
						|
#                 instance.  Requires that |numNodes| = 1 and that addressName
 | 
						|
#                 has been provisioned in the GCE region that is hosting `$zone`
 | 
						|
#
 | 
						|
# Tip: use cloud_FindInstances to locate the instances once this function
 | 
						|
#      returns
 | 
						|
cloud_CreateInstances() {
 | 
						|
  declare networkName="$1"
 | 
						|
  declare namePrefix="$2"
 | 
						|
  declare numNodes="$3"
 | 
						|
  declare enableGpu="$4"
 | 
						|
  declare machineType="$5"
 | 
						|
  declare zone="$6"
 | 
						|
  declare optionalBootDiskSize="$7"
 | 
						|
  declare optionalStartupScript="$8"
 | 
						|
  declare optionalAddress="$9"
 | 
						|
  declare optionalBootDiskType="${10}"
 | 
						|
 | 
						|
  declare -a nodes
 | 
						|
  if [[ $numNodes = 1 ]]; then
 | 
						|
    nodes=("$namePrefix")
 | 
						|
  else
 | 
						|
    for node in $(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes"); do
 | 
						|
      nodes+=("$node")
 | 
						|
    done
 | 
						|
  fi
 | 
						|
 | 
						|
  declare -a args
 | 
						|
  args=(
 | 
						|
    --resource-group "$networkName"
 | 
						|
    --tags testnet
 | 
						|
    --image UbuntuLTS
 | 
						|
    --size "$machineType"
 | 
						|
    --location "$zone"
 | 
						|
    --generate-ssh-keys
 | 
						|
  )
 | 
						|
 | 
						|
  if [[ -n $optionalBootDiskSize ]]; then
 | 
						|
    args+=(
 | 
						|
      --os-disk-size-gb "$optionalBootDiskSize"
 | 
						|
    )
 | 
						|
  fi
 | 
						|
  if [[ -n $optionalStartupScript ]]; then
 | 
						|
    args+=(
 | 
						|
      --custom-data "$optionalStartupScript"
 | 
						|
    )
 | 
						|
  fi
 | 
						|
 | 
						|
  if [[ -n $optionalBootDiskType ]]; then
 | 
						|
    args+=(
 | 
						|
      --storage-sku "$optionalBootDiskType"
 | 
						|
    )
 | 
						|
  else
 | 
						|
    args+=(
 | 
						|
      --storage-sku StandardSSD_LRS
 | 
						|
    )
 | 
						|
  fi
 | 
						|
 | 
						|
  if [[ -n $optionalAddress ]]; then
 | 
						|
    [[ $numNodes = 1 ]] || {
 | 
						|
      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
 | 
						|
      exit 1
 | 
						|
    }
 | 
						|
    args+=(
 | 
						|
      --public-ip-address "$optionalAddress"
 | 
						|
    )
 | 
						|
  fi
 | 
						|
 | 
						|
  (
 | 
						|
    set -x
 | 
						|
    # 1: Check if resource group exists.  If not, create it.
 | 
						|
    numGroup=$(az group list --query "length([?name=='$networkName'])")
 | 
						|
    if [[ $numGroup -eq 0 ]]; then
 | 
						|
      echo Resource Group "$networkName" does not exist.  Creating it now.
 | 
						|
      az group create --name "$networkName" --location "$zone"
 | 
						|
    else
 | 
						|
      echo Resource group "$networkName" already exists.
 | 
						|
      az group show --name "$networkName"
 | 
						|
    fi
 | 
						|
 | 
						|
    # 2: For node in numNodes, create VM and put the creation process in the background with --no-wait
 | 
						|
    for nodeName in "${nodes[@]}"; do
 | 
						|
      az vm create --name "$nodeName" "${args[@]}" --no-wait
 | 
						|
    done
 | 
						|
 | 
						|
    # 3. If GPU is to be enabled, wait until nodes are created, then install the appropriate extension
 | 
						|
    if $enableGpu; then
 | 
						|
      for nodeName in "${nodes[@]}"; do
 | 
						|
        az vm wait --created --name "$nodeName" --resource-group "$networkName" --verbose --timeout 600
 | 
						|
      done
 | 
						|
 | 
						|
      for nodeName in "${nodes[@]}"; do
 | 
						|
        az vm extension set \
 | 
						|
        --resource-group "$networkName" \
 | 
						|
        --vm-name "$nodeName" \
 | 
						|
        --name NvidiaGpuDriverLinux \
 | 
						|
        --publisher Microsoft.HpcCompute \
 | 
						|
        --version 1.2 \
 | 
						|
        --no-wait
 | 
						|
      done
 | 
						|
 | 
						|
      # 4. Wait until all nodes have GPU extension installed
 | 
						|
      for nodeName in "${nodes[@]}"; do
 | 
						|
        az vm wait --updated --name "$nodeName" --resource-group "$networkName" --verbose --timeout 600
 | 
						|
      done
 | 
						|
    fi
 | 
						|
  )
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_DeleteInstances
 | 
						|
#
 | 
						|
# Deletes all the instances listed in the `instances` array
 | 
						|
#
 | 
						|
cloud_DeleteInstances() {
 | 
						|
  if [[ ${#instances[0]} -eq 0 ]]; then
 | 
						|
    echo No instances to delete
 | 
						|
    return
 | 
						|
  fi
 | 
						|
 | 
						|
  declare names=("${instances[@]/:*/}")
 | 
						|
  (
 | 
						|
    set -x
 | 
						|
    id_list=()
 | 
						|
 | 
						|
    # Build a space delimited list of all resource IDs to delete
 | 
						|
    for instance in "${names[@]}"; do
 | 
						|
      cloud_GetIdFromInstanceName "$instance"
 | 
						|
      id_list+=("$id")
 | 
						|
    done
 | 
						|
 | 
						|
    # Delete all instances in the id_list and return once they are all deleted
 | 
						|
    az vm delete --ids "${id_list[@]}" --yes --verbose --no-wait
 | 
						|
  )
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout]
 | 
						|
#
 | 
						|
# Return once the newly created VM instance is responding.  This function is cloud-provider specific.
 | 
						|
#
 | 
						|
cloud_WaitForInstanceReady() {
 | 
						|
  declare instanceName="$1"
 | 
						|
#  declare instanceIp="$2"  # unused
 | 
						|
#  declare instanceZone="$3"  # unused
 | 
						|
  declare timeout="$4"
 | 
						|
 | 
						|
  cloud_GetResourceGroupFromInstanceName "$instanceName"
 | 
						|
  az vm wait -g "$resourceGroup" -n "$instanceName" --created  --interval 10 --timeout "$timeout"
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
 | 
						|
#
 | 
						|
# Fetch a file from the given instance.  This function uses a cloud-specific
 | 
						|
# mechanism to fetch the file
 | 
						|
#
 | 
						|
cloud_FetchFile() {
 | 
						|
  declare instanceName="$1"
 | 
						|
  declare publicIp="$2"
 | 
						|
  declare remoteFile="$3"
 | 
						|
  declare localFile="$4"
 | 
						|
 | 
						|
  cloud_GetConfigValueFromInstanceName "$instanceName" osProfile.adminUsername
 | 
						|
  scp "${config_value}@${publicIp}:${remoteFile}" "$localFile"
 | 
						|
}
 |