Establish net/scripts/... for better scoping
This commit is contained in:
@ -1,333 +0,0 @@
|
||||
# |source| this file
|
||||
#
|
||||
# Utilities for working with gcloud
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# gcloud_FindInstances [filter] [options]
|
||||
#
|
||||
# Find instances matching the specified pattern.
|
||||
#
|
||||
# For each matching instance, an entry in the `instances` array will be added with the
|
||||
# following information about the instance:
|
||||
# "name:zone:public IP:private IP"
|
||||
#
|
||||
# filter - The instances to filter on
|
||||
# options - If set to the string "show", the list of instances will be echoed
|
||||
# to stdout
|
||||
#
|
||||
# examples:
|
||||
# $ gcloud_FindInstances "name=exact-machine-name"
|
||||
# $ gcloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
|
||||
#
|
||||
gcloud_FindInstances() {
|
||||
declare filter="$1"
|
||||
declare options="$2"
|
||||
instances=()
|
||||
|
||||
declare name zone publicIp privateIp status
|
||||
while read -r name zone publicIp privateIp status; do
|
||||
if [[ $status != RUNNING ]]; then
|
||||
echo "Warning: $name is not RUNNING, ignoring it."
|
||||
continue
|
||||
fi
|
||||
if [[ $options = show ]]; then
|
||||
printf "%-30s | %-16s publicIp=%-16s privateIp=%s\n" "$name" "$zone" "$publicIp" "$privateIp"
|
||||
fi
|
||||
|
||||
instances+=("$name:$zone:$publicIp:$privateIp")
|
||||
done < <(gcloud compute instances list \
|
||||
--filter="$filter" \
|
||||
--format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
|
||||
}
|
||||
|
||||
#
|
||||
# gcloud_ForEachInstance [cmd] [extra args to cmd]
|
||||
#
|
||||
# Execute a command for each element in the `instances` array
|
||||
#
|
||||
# cmd - The command to execute on each instance
|
||||
# The command will receive arguments followed by any
|
||||
# additionl arguments supplied to gcloud_ForEachInstance:
|
||||
# name - name of the instance
|
||||
# zone - zone the instance is located in
|
||||
# publicIp - The public IP address of this instance
|
||||
# privateIp - The priate IP address of this instance
|
||||
# count - Monotonically increasing count for each
|
||||
# invocation of cmd, starting at 1
|
||||
# ... - Extra args to cmd..
|
||||
#
|
||||
#
|
||||
gcloud_ForEachInstance() {
|
||||
declare cmd="$1"
|
||||
shift
|
||||
[[ -n $cmd ]] || { echo gcloud_ForEachInstance: cmd not specified; exit 1; }
|
||||
|
||||
declare count=1
|
||||
for info in "${instances[@]}"; do
|
||||
declare name zone publicIp privateIp
|
||||
IFS=: read -r name zone publicIp privateIp < <(echo "$info")
|
||||
|
||||
eval "$cmd" "$name" "$zone" "$publicIp" "$privateIp" "$count" "$@"
|
||||
count=$((count + 1))
|
||||
done
|
||||
}
|
||||
|
||||
#
|
||||
# gcloud_CreateInstances [namePrefix] [numNodes] [zone] [imageName]
|
||||
# [machineType] [bootDiskSize] [accelerator]
|
||||
# [startupScript] [address]
|
||||
#
|
||||
# Creates one more identical instances.
|
||||
#
|
||||
# namePrefix - unique string to prefix all the instance names with
|
||||
# numNodes - number of instances to create
|
||||
# zone - zone to create the instances in
|
||||
# imageName - Disk image for the instances
|
||||
# machineType - GCE machine type
|
||||
# bootDiskSize - Optional disk of the boot disk
|
||||
# accelerator - Optional accelerator to attach to the instance(s), see
|
||||
# eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
|
||||
# startupScript - Optional startup script to execute when the instance boots
|
||||
# address - Optional name of the GCE static IP address to attach to the
|
||||
# instance. Requires that |numNodes| = 1 and that addressName
|
||||
# has been provisioned in the GCE region that is hosting |zone|
|
||||
#
|
||||
# Tip: use gcloud_FindInstances to locate the instances once this function
|
||||
# returns
|
||||
gcloud_CreateInstances() {
|
||||
declare namePrefix="$1"
|
||||
declare numNodes="$2"
|
||||
declare zone="$3"
|
||||
declare imageName="$4"
|
||||
declare machineType="$5"
|
||||
declare optionalBootDiskSize="$6"
|
||||
declare optionalAccelerator="$7"
|
||||
declare optionalStartupScript="$8"
|
||||
declare optionalAddress="$9"
|
||||
|
||||
declare nodes
|
||||
if [[ $numNodes = 1 ]]; then
|
||||
nodes=("$namePrefix")
|
||||
else
|
||||
read -ra nodes <<<$(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
|
||||
fi
|
||||
|
||||
declare -a args
|
||||
args=(
|
||||
"--zone=$zone"
|
||||
"--tags=testnet"
|
||||
"--image=$imageName"
|
||||
"--machine-type=$machineType"
|
||||
)
|
||||
if [[ -n $optionalBootDiskSize ]]; then
|
||||
args+=(
|
||||
"--boot-disk-size=$optionalBootDiskSize"
|
||||
)
|
||||
fi
|
||||
if [[ -n $optionalAccelerator ]]; then
|
||||
args+=(
|
||||
"--accelerator=$optionalAccelerator"
|
||||
--maintenance-policy TERMINATE
|
||||
--restart-on-failure
|
||||
)
|
||||
fi
|
||||
if [[ -n $optionalStartupScript ]]; then
|
||||
args+=(
|
||||
--metadata-from-file "startup-script=$optionalStartupScript"
|
||||
)
|
||||
fi
|
||||
|
||||
if [[ -n $optionalAddress ]]; then
|
||||
[[ $numNodes = 1 ]] || {
|
||||
echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
|
||||
exit 1
|
||||
}
|
||||
args+=(
|
||||
"--address=$optionalAddress"
|
||||
)
|
||||
fi
|
||||
|
||||
(
|
||||
set -x
|
||||
gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
|
||||
)
|
||||
}
|
||||
|
||||
#
|
||||
# gcloud_DeleteInstances [yes]
|
||||
#
|
||||
# Deletes all the instances listed in the `instances` array
|
||||
#
|
||||
# If yes = "true", skip the delete confirmation
|
||||
#
|
||||
gcloud_DeleteInstances() {
|
||||
declare maybeQuiet=
|
||||
if [[ $1 = true ]]; then
|
||||
maybeQuiet=--quiet
|
||||
fi
|
||||
|
||||
if [[ ${#instances[0]} -eq 0 ]]; then
|
||||
echo No instances to delete
|
||||
return
|
||||
fi
|
||||
declare names=("${instances[@]/:*/}")
|
||||
|
||||
# Assume all instances are in the same zone
|
||||
# TODO: One day this assumption will be invalid
|
||||
declare zone
|
||||
IFS=: read -r _ zone _ < <(echo "${instances[0]}")
|
||||
|
||||
(
|
||||
set -x
|
||||
gcloud beta compute instances delete --zone "$zone" $maybeQuiet "${names[@]}"
|
||||
)
|
||||
}
|
||||
|
||||
#
|
||||
# gcloud_FigureRemoteUsername [instanceInfo]
|
||||
#
|
||||
# The remote username when ssh-ing into GCP instances tends to not be the same
|
||||
# as the user's local username, but it needs to be discovered by ssh-ing into an
|
||||
# instance and examining the system.
|
||||
#
|
||||
# On success the gcloud_username global variable is updated
|
||||
#
|
||||
# instanceInfo - an entry from the `instances` array
|
||||
#
|
||||
# example:
|
||||
# gcloud_FigureRemoteUsername "name:zone:..."
|
||||
#
|
||||
gcloud_FigureRemoteUsername() {
|
||||
if [[ -n $gcloud_username ]]; then
|
||||
return
|
||||
fi
|
||||
|
||||
declare instanceInfo="$1"
|
||||
declare name zone publicIp
|
||||
IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo")
|
||||
|
||||
echo "Detecting remote username using $zone in $zone:"
|
||||
|
||||
# Figure the gcp ssh username
|
||||
(
|
||||
set -x
|
||||
|
||||
# Try to ping the machine first. There can be a delay between when the
|
||||
# instance is reported as RUNNING and when it's reachable over the network
|
||||
timeout 30s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"
|
||||
|
||||
# Try to ssh in a couple times, sshd may not yet be up even though the
|
||||
# machine can be pinged...
|
||||
set -o pipefail
|
||||
for i in $(seq 1 10); do
|
||||
if gcloud compute ssh "$name" \
|
||||
--zone "$zone" -- "echo whoami:\$USER:iamwho" \
|
||||
| tr -d $'\r '| tee /tmp/whoami-$$; then
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
echo "Retry $i..."
|
||||
done
|
||||
)
|
||||
while IFS=: read -r whoami gcloud_username iamwho ; do
|
||||
[[ $whoami == "whoami" && $iamwho == "iamwho" ]] && break;
|
||||
done < /tmp/whoami-$$
|
||||
rm -f /tmp/whoami-$$
|
||||
|
||||
if [[ -z $gcloud_username ]]; then
|
||||
echo Unable to figure remote user name
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Remote username: $gcloud_username"
|
||||
}
|
||||
|
||||
#
|
||||
# gcloud_PrepInstancesForSsh [username] [privateKey]
|
||||
#
|
||||
# Prepares all the instances in the `instances` array for ssh with the specified
|
||||
# keypair. This eliminates the need to use the restrictive |gcloud compute ssh|,
|
||||
# use plain |ssh| instead.
|
||||
#
|
||||
# username - gcp ssh username as computed by gcloud_FigureRemoteUsername
|
||||
# privateKey - private key to install on all the instances
|
||||
#
|
||||
gcloud_PrepInstancesForSsh() {
|
||||
declare username="$1"
|
||||
declare privateKey="$2"
|
||||
declare publicKey="$privateKey".pub
|
||||
declare logDir=log/
|
||||
|
||||
mkdir -p $logDir
|
||||
rm -rf $logDir/gcloud_PrepInstancesForSsh-*
|
||||
|
||||
[[ -r $publicKey ]] || {
|
||||
echo "Unable to read public key: $publicKey"
|
||||
exit 1
|
||||
}
|
||||
|
||||
[[ -r $privateKey ]] || {
|
||||
echo "Unable to read private key: $privateKey"
|
||||
exit 1
|
||||
}
|
||||
|
||||
[[ -d $logDir ]] || {
|
||||
echo "logDir does not exist: $logDir"
|
||||
exit 1
|
||||
}
|
||||
|
||||
declare -a pids
|
||||
for instanceInfo in "${instances[@]}"; do
|
||||
declare name zone publicIp
|
||||
IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo")
|
||||
|
||||
logFile="$logDir/gcloud_PrepInstancesForSsh-$name.log"
|
||||
|
||||
# TODO: This next subshell runs in series because for unknown reason running
|
||||
# multiple |gcloud compute ssh| commands in parallel cause the macOS
|
||||
# terminal to misbehave
|
||||
(
|
||||
set -x
|
||||
|
||||
# Try to ping the machine first. There can be a delay between when the
|
||||
# instance is reported as RUNNING and when it's reachable over the network
|
||||
timeout 60s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"
|
||||
|
||||
gcloud compute ssh --zone "$zone" "$name" -- "
|
||||
set -x;
|
||||
mkdir -p .ssh;
|
||||
echo \"$(cat "$publicKey")\" >> .ssh/authorized_keys;
|
||||
echo \"
|
||||
Host *
|
||||
BatchMode yes
|
||||
IdentityFile ~/.ssh/id_testnet
|
||||
StrictHostKeyChecking no
|
||||
\" > .ssh/config;
|
||||
"
|
||||
) >> "$logFile" 2>&1
|
||||
(
|
||||
set -x
|
||||
scp \
|
||||
-o StrictHostKeyChecking=no \
|
||||
-o UserKnownHostsFile=/dev/null \
|
||||
-i "$privateKey" \
|
||||
"$privateKey" "$username@$publicIp:.ssh/id_testnet"
|
||||
) >> "$logFile" 2>&1 &
|
||||
declare pid=$!
|
||||
|
||||
ln -sfT "$logFile" "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
|
||||
pids+=("$pid")
|
||||
done
|
||||
|
||||
for pid in "${pids[@]}"; do
|
||||
declare ok=true
|
||||
wait "$pid" || ok=false
|
||||
if ! $ok; then
|
||||
cat "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
|
||||
echo ^^^ +++
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
#!/bin/bash -x
|
||||
#
|
||||
# Install EarlyOOM
|
||||
#
|
||||
|
||||
[[ $(uname) = Linux ]] || exit 1
|
||||
|
||||
# 64 - enable signalling of processes (term, kill, oom-kill)
|
||||
# TODO: This setting will not persist across reboots
|
||||
sysrq=$(( $(cat /proc/sys/kernel/sysrq) | 64 ))
|
||||
sudo sysctl -w kernel.sysrq=$sysrq
|
||||
|
||||
if command -v earlyoom; then
|
||||
sudo systemctl status earlyoom
|
||||
exit 0
|
||||
fi
|
||||
|
||||
wget http://ftp.us.debian.org/debian/pool/main/e/earlyoom/earlyoom_1.1-2_amd64.deb
|
||||
sudo apt install --quiet --yes ./earlyoom_1.1-2_amd64.deb
|
||||
|
||||
cat > earlyoom <<OOM
|
||||
# use the kernel OOM killer, trigger at 20% available RAM,
|
||||
EARLYOOM_ARGS="-k -m 20"
|
||||
OOM
|
||||
sudo cp earlyoom /etc/default/
|
||||
rm earlyoom
|
||||
|
||||
sudo systemctl stop earlyoom
|
||||
sudo systemctl enable earlyoom
|
||||
sudo systemctl start earlyoom
|
||||
|
||||
exit 0
|
Reference in New Issue
Block a user