diff --git a/net/common.sh b/net/common.sh index c220ef3291..d084732ce7 100644 --- a/net/common.sh +++ b/net/common.sh @@ -7,8 +7,12 @@ # shellcheck disable=2034 # -netConfigDir="$(dirname "${BASH_SOURCE[0]}")"/config -netLogDir="$(dirname "${BASH_SOURCE[0]}")"/log +netDir=$( + cd "$(dirname "${BASH_SOURCE[0]}")" || exit + echo "$PWD" +) +netConfigDir="$netDir"/config +netLogDir="$netDir"/log mkdir -p "$netConfigDir" "$netLogDir" # shellcheck source=scripts/configure-metrics.sh @@ -21,7 +25,6 @@ publicNetwork= leaderIp= netBasename= sshPrivateKey= -sshUsername= clientIpList=() sshOptions=() validatorIpList=() @@ -31,9 +34,10 @@ buildSshOptions() { -o "BatchMode=yes" -o "StrictHostKeyChecking=no" -o "UserKnownHostsFile=/dev/null" - -o "User=$sshUsername" + -o "User=solana" -o "IdentityFile=$sshPrivateKey" -o "LogLevel=ERROR" + -F /dev/null ) } @@ -47,7 +51,6 @@ loadConfigFile() { [[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile" [[ -n "$netBasename" ]] || usage "Config file invalid, netBasename unspecified: $configFile" [[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile" - [[ -n $sshUsername ]] || usage "Config file invalid, sshUsername unspecified: $configFile" [[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile" buildSshOptions diff --git a/net/gce.sh b/net/gce.sh index 9fa161d9f6..f81964aa86 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -106,6 +106,7 @@ done shift $((OPTIND - 1)) [[ -z $1 ]] || usage "Unexpected argument: $1" +sshPrivateKey="$netConfigDir/id_$prefix" prepareInstancesAndWriteConfigFile() { $metricsWriteDatapoint "testnet-deploy net-config-begin=1" @@ -114,15 +115,10 @@ prepareInstancesAndWriteConfigFile() { # autogenerated at $(date) netBasename=$prefix publicNetwork=$publicNetwork +sshPrivateKey=$sshPrivateKey EOF - declare sshPrivateKey="$netConfigDir/id_$prefix" - rm -rf "$sshPrivateKey"{,.pub} - ( - set -x - 
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
-  )
-  echo "sshPrivateKey=$sshPrivateKey" >> "$configFile"
+  buildSshOptions
 
   recordInstanceIp() {
     declare name="$1"
@@ -141,38 +137,79 @@ EOF
     fi
   }
 
+  waitForStartupComplete() {
+    declare name="$1"
+    declare publicIp="$3"  # third argument is the address used as the ssh target
+
+    echo "Waiting for $name to finish booting..."
+    (
+      for i in $(seq 1 30); do
+        if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.gce-startup-complete"); then
+          break
+        fi
+        sleep 2
+        echo "Retry $i..."
+      done
+    )
+  }
+
   echo "Looking for leader instance..."
   gcloud_FindInstances "name=$prefix-leader" show
   [[ ${#instances[@]} -eq 1 ]] || {
-    echo "Unable to start leader"
+    echo "Unable to find leader"
     exit 1
   }
-  gcloud_FigureRemoteUsername "${instances[0]}"
-  sshUsername=$gcloud_username
-  echo "sshUsername=$sshUsername" >> "$configFile"
-  buildSshOptions
 
-  gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
+  echo "Fetching $sshPrivateKey from ${instances[0]%%:*}"
+  (
+    rm -rf "$sshPrivateKey"{,.pub}  # remove the stale public half too, not "<key>pub"
+
+    declare leaderName
+    declare leaderZone
+    declare leaderIp
+    IFS=: read -r leaderName leaderZone leaderIp _ < <(echo "${instances[0]}")
+
+    set -x
+
+    # Try to ping the machine first.  There can be a delay between when the
+    # instance is reported as RUNNING and when it's reachable over the network
+    timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
+
+    # Try to scp in a couple times, sshd may not yet be up even though the
+    # machine can be pinged...
+    set -o pipefail
+    for i in $(seq 1 10); do
+      if gcloud compute scp --zone "$leaderZone" \
+        "$leaderName:/solana-id_ecdsa" "$sshPrivateKey"; then
+        break
+      fi
+      sleep 1
+      echo "Retry $i..."
+    done
+
+    chmod 400 "$sshPrivateKey"  # owner read-only
+  )
 
   echo "leaderIp=()" >> "$configFile"
   gcloud_ForEachInstance recordInstanceIp leaderIp
+  gcloud_ForEachInstance waitForStartupComplete
 
   echo "Looking for validator instances..."
gcloud_FindInstances "name~^$prefix-validator" show [[ ${#instances[@]} -gt 0 ]] || { - echo "Unable to start validators" + echo "Unable to find validators" exit 1 } echo "validatorIpList=()" >> "$configFile" - gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" gcloud_ForEachInstance recordInstanceIp validatorIpList + gcloud_ForEachInstance waitForStartupComplete echo "clientIpList=()" >> "$configFile" echo "Looking for client instances..." gcloud_FindInstances "name~^$prefix-client" show [[ ${#instances[@]} -eq 0 ]] || { - gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" gcloud_ForEachInstance recordInstanceIp clientIpList + gcloud_ForEachInstance waitForStartupComplete } echo "Wrote $configFile" @@ -206,6 +243,9 @@ create) $metricsWriteDatapoint "testnet-deploy net-create-begin=1" + rm -rf "$sshPrivateKey"{,.pub} + ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey" + printNetworkInfo() { cat < /etc/motd < /solana-id_ecdsa < /solana-id_ecdsa.pub < /etc/motd < client.log + deployMethod="$1" entrypointIp="$2" numNodes="$3" diff --git a/net/scripts/create-solana-user.sh b/net/scripts/create-solana-user.sh new file mode 100755 index 0000000000..457639295d --- /dev/null +++ b/net/scripts/create-solana-user.sh @@ -0,0 +1,27 @@ +#!/bin/bash -ex + +[[ $(uname) = Linux ]] || exit 1 +[[ $USER = root ]] || exit 1 + +adduser solana --gecos "" --disabled-password --quiet +adduser solana sudo +echo "solana ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +id solana + +[[ -r /solana-id_ecdsa ]] || exit 1 +[[ -r /solana-id_ecdsa.pub ]] || exit 1 + +sudo -u solana bash -c " + mkdir -p /home/solana/.ssh/ + cd /home/solana/.ssh/ + cp /solana-id_ecdsa.pub authorized_keys + umask 377 + cp /solana-id_ecdsa id_ecdsa + echo \" + Host * + BatchMode yes + IdentityFile ~/.ssh/id_ecdsa + StrictHostKeyChecking no + \" > config +" + diff --git a/net/scripts/disable-background-upgrades.sh b/net/scripts/disable-background-upgrades.sh index f100860627..ad4903674c 100755 --- 
a/net/scripts/disable-background-upgrades.sh +++ b/net/scripts/disable-background-upgrades.sh @@ -1,5 +1,5 @@ #!/bin/bash -ex - +# # Prevent background upgrades that block |apt-get| # # TODO: This approach is pretty uncompromising. An alternative solution that @@ -18,4 +18,3 @@ while fuser /var/lib/dpkg/lock; do sleep 1 done - diff --git a/net/scripts/gcloud.sh b/net/scripts/gcloud.sh index 77be58cc37..d758b76d15 100644 --- a/net/scripts/gcloud.sh +++ b/net/scripts/gcloud.sh @@ -185,149 +185,3 @@ gcloud_DeleteInstances() { ) } -# -# gcloud_FigureRemoteUsername [instanceInfo] -# -# The remote username when ssh-ing into GCP instances tends to not be the same -# as the user's local username, but it needs to be discovered by ssh-ing into an -# instance and examining the system. -# -# On success the gcloud_username global variable is updated -# -# instanceInfo - an entry from the `instances` array -# -# example: -# gcloud_FigureRemoteUsername "name:zone:..." -# -gcloud_FigureRemoteUsername() { - if [[ -n $gcloud_username ]]; then - return - fi - - declare instanceInfo="$1" - declare name zone publicIp - IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo") - - echo "Detecting remote username using $zone in $zone:" - - # Figure the gcp ssh username - ( - set -x - - # Try to ping the machine first. There can be a delay between when the - # instance is reported as RUNNING and when it's reachable over the network - timeout 30s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done" - - # Try to ssh in a couple times, sshd may not yet be up even though the - # machine can be pinged... - set -o pipefail - for i in $(seq 1 10); do - if gcloud compute ssh "$name" \ - --zone "$zone" -- "echo whoami:\$USER:iamwho" \ - | tr -d $'\r '| tee /tmp/whoami-$$; then - break - fi - sleep 1 - echo "Retry $i..." 
- done - ) - while IFS=: read -r whoami gcloud_username iamwho ; do - [[ $whoami == "whoami" && $iamwho == "iamwho" ]] && break; - done < /tmp/whoami-$$ - rm -f /tmp/whoami-$$ - - if [[ -z $gcloud_username ]]; then - echo Unable to figure remote user name - exit 1 - fi - - echo "Remote username: $gcloud_username" -} - -# -# gcloud_PrepInstancesForSsh [username] [privateKey] -# -# Prepares all the instances in the `instances` array for ssh with the specified -# keypair. This eliminates the need to use the restrictive |gcloud compute ssh|, -# use plain |ssh| instead. -# -# username - gcp ssh username as computed by gcloud_FigureRemoteUsername -# privateKey - private key to install on all the instances -# -gcloud_PrepInstancesForSsh() { - declare username="$1" - declare privateKey="$2" - declare publicKey="$privateKey".pub - declare logDir=log/ - - mkdir -p $logDir - rm -rf $logDir/gcloud_PrepInstancesForSsh-* - - [[ -r $publicKey ]] || { - echo "Unable to read public key: $publicKey" - exit 1 - } - - [[ -r $privateKey ]] || { - echo "Unable to read private key: $privateKey" - exit 1 - } - - [[ -d $logDir ]] || { - echo "logDir does not exist: $logDir" - exit 1 - } - - declare -a pids - for instanceInfo in "${instances[@]}"; do - declare name zone publicIp - IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo") - - logFile="$logDir/gcloud_PrepInstancesForSsh-$name.log" - - # TODO: This next subshell runs in series because for unknown reason running - # multiple |gcloud compute ssh| commands in parallel cause the macOS - # terminal to misbehave - ( - set -x - - # Try to ping the machine first. 
There can be a delay between when the - # instance is reported as RUNNING and when it's reachable over the network - timeout 60s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done" - - gcloud compute ssh --zone "$zone" "$name" -- " - set -x; - mkdir -p .ssh; - echo \"$(cat "$publicKey")\" >> .ssh/authorized_keys; - echo \" - Host * - BatchMode yes - IdentityFile ~/.ssh/id_testnet - StrictHostKeyChecking no - \" > .ssh/config; - " - ) >> "$logFile" 2>&1 - ( - set -x - scp \ - -o StrictHostKeyChecking=no \ - -o UserKnownHostsFile=/dev/null \ - -i "$privateKey" \ - "$privateKey" "$username@$publicIp:.ssh/id_testnet" - ) >> "$logFile" 2>&1 & - declare pid=$! - - ln -sfT "$logFile" "$logDir/gcloud_PrepInstancesForSsh-$pid.log" - pids+=("$pid") - done - - for pid in "${pids[@]}"; do - declare ok=true - wait "$pid" || ok=false - if ! $ok; then - cat "$logDir/gcloud_PrepInstancesForSsh-$pid.log" - echo ^^^ +++ - exit 1 - fi - done -} diff --git a/net/ssh.sh b/net/ssh.sh index c9935a78ef..3cc281b1d1 100755 --- a/net/ssh.sh +++ b/net/ssh.sh @@ -46,7 +46,7 @@ fi printNode() { declare nodeType=$1 declare ip=$2 - printf " %-25s | For logs run: $0 $ip tail -f /tmp/solana/=/$nodeType.log\n" "$0 $ip" + printf " %-25s | For logs run: $0 $ip tail -f solana/$nodeType.log\n" "$0 $ip" } echo Leader: