Use a common solana user on all testnet instances

This commit is contained in:
Michael Vines
2018-09-08 19:19:12 -07:00
committed by Grimes
parent 7029e4395c
commit ebcac3c2d1
8 changed files with 116 additions and 181 deletions

View File

@ -7,8 +7,12 @@
# shellcheck disable=2034 # shellcheck disable=2034
# #
netConfigDir="$(dirname "${BASH_SOURCE[0]}")"/config netDir=$(
netLogDir="$(dirname "${BASH_SOURCE[0]}")"/log cd "$(dirname "${BASH_SOURCE[0]}")" || exit
echo "$PWD"
)
netConfigDir="$netDir"/config
netLogDir="$netDir"/log
mkdir -p "$netConfigDir" "$netLogDir" mkdir -p "$netConfigDir" "$netLogDir"
# shellcheck source=scripts/configure-metrics.sh # shellcheck source=scripts/configure-metrics.sh
@ -21,7 +25,6 @@ publicNetwork=
leaderIp= leaderIp=
netBasename= netBasename=
sshPrivateKey= sshPrivateKey=
sshUsername=
clientIpList=() clientIpList=()
sshOptions=() sshOptions=()
validatorIpList=() validatorIpList=()
@ -31,9 +34,10 @@ buildSshOptions() {
-o "BatchMode=yes" -o "BatchMode=yes"
-o "StrictHostKeyChecking=no" -o "StrictHostKeyChecking=no"
-o "UserKnownHostsFile=/dev/null" -o "UserKnownHostsFile=/dev/null"
-o "User=$sshUsername" -o "User=solana"
-o "IdentityFile=$sshPrivateKey" -o "IdentityFile=$sshPrivateKey"
-o "LogLevel=ERROR" -o "LogLevel=ERROR"
-F /dev/null
) )
} }
@ -47,7 +51,6 @@ loadConfigFile() {
[[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile" [[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile"
[[ -n "$netBasename" ]] || usage "Config file invalid, netBasename unspecified: $configFile" [[ -n "$netBasename" ]] || usage "Config file invalid, netBasename unspecified: $configFile"
[[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile" [[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile"
[[ -n $sshUsername ]] || usage "Config file invalid, sshUsername unspecified: $configFile"
[[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile" [[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile"
buildSshOptions buildSshOptions

View File

@ -106,6 +106,7 @@ done
shift $((OPTIND - 1)) shift $((OPTIND - 1))
[[ -z $1 ]] || usage "Unexpected argument: $1" [[ -z $1 ]] || usage "Unexpected argument: $1"
sshPrivateKey="$netConfigDir/id_$prefix"
prepareInstancesAndWriteConfigFile() { prepareInstancesAndWriteConfigFile() {
$metricsWriteDatapoint "testnet-deploy net-config-begin=1" $metricsWriteDatapoint "testnet-deploy net-config-begin=1"
@ -114,15 +115,10 @@ prepareInstancesAndWriteConfigFile() {
# autogenerated at $(date) # autogenerated at $(date)
netBasename=$prefix netBasename=$prefix
publicNetwork=$publicNetwork publicNetwork=$publicNetwork
sshPrivateKey=$sshPrivateKey
EOF EOF
declare sshPrivateKey="$netConfigDir/id_$prefix" buildSshOptions
rm -rf "$sshPrivateKey"{,.pub}
(
set -x
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
)
echo "sshPrivateKey=$sshPrivateKey" >> "$configFile"
recordInstanceIp() { recordInstanceIp() {
declare name="$1" declare name="$1"
@ -141,38 +137,79 @@ EOF
fi fi
} }
#
# waitForStartupComplete [name] [unused] [publicIp]
#
# Polls an instance over ssh until the marker file /.gce-startup-complete
# exists on it.
#
# name     - instance name, only used in progress messages
# $2       - ignored (presumably the instance zone; verify against
#            gcloud_ForEachInstance)
# publicIp - address handed to ssh
#
# Reads the global `sshOptions` array for the ssh command line.
#
# Returns 0 as soon as the marker file is seen.  Returns 1, with a message on
# stderr, if the file is still missing after 30 attempts spaced 2 seconds
# apart, so that callers are able to notice an instance that never finished
# booting.
#
waitForStartupComplete() {
  declare name="$1"
  declare publicIp="$3"
  declare i

  echo "Waiting for $name to finish booting..."
  for ((i = 1; i <= 30; i++)); do
    # The ssh runs in its own subshell so that `set -x` traces just this
    # command without turning tracing on for the caller
    if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.gce-startup-complete"); then
      return 0
    fi
    sleep 2
    echo "Retry $i..."
  done

  echo "Error: $name ($publicIp) did not finish booting in time" >&2
  return 1
}
echo "Looking for leader instance..." echo "Looking for leader instance..."
gcloud_FindInstances "name=$prefix-leader" show gcloud_FindInstances "name=$prefix-leader" show
[[ ${#instances[@]} -eq 1 ]] || { [[ ${#instances[@]} -eq 1 ]] || {
echo "Unable to start leader" echo "Unable to find leader"
exit 1 exit 1
} }
gcloud_FigureRemoteUsername "${instances[0]}"
sshUsername=$gcloud_username
echo "sshUsername=$sshUsername" >> "$configFile"
buildSshOptions
gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey" echo "Fetching $sshPrivateKey from $leaderName"
(
  # Discard any stale local key pair before fetching a fresh private key.
  # Note the leading dot in ".pub": without it the public key file would be
  # left behind.
  rm -rf "$sshPrivateKey"{,.pub}

  # Each `instances` entry is colon-delimited: name:zone:publicIp:...
  declare leaderName
  declare leaderZone
  declare leaderIp
  IFS=: read -r leaderName leaderZone leaderIp _ < <(echo "${instances[0]}")

  set -x

  # Try to ping the machine first.  There can be a delay between when the
  # instance is reported as RUNNING and when it's reachable over the network
  timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"

  # Try to scp in a couple times, sshd may not yet be up even though the
  # machine can be pinged...
  set -o pipefail
  declare fetched=false
  for i in $(seq 1 10); do
    if gcloud compute scp --zone "$leaderZone" \
      "$leaderName:/solana-id_ecdsa" "$sshPrivateKey"; then
      fetched=true
      break
    fi
    sleep 1
    echo "Retry $i..."
  done

  # Fail with a clear message rather than an unrelated chmod error when every
  # attempt failed
  if ! $fetched; then
    echo "Error: unable to fetch $sshPrivateKey from $leaderName" >&2
    exit 1
  fi

  # Restrict the key to owner read-only
  chmod 400 "$sshPrivateKey"
)
echo "leaderIp=()" >> "$configFile" echo "leaderIp=()" >> "$configFile"
gcloud_ForEachInstance recordInstanceIp leaderIp gcloud_ForEachInstance recordInstanceIp leaderIp
gcloud_ForEachInstance waitForStartupComplete
echo "Looking for validator instances..." echo "Looking for validator instances..."
gcloud_FindInstances "name~^$prefix-validator" show gcloud_FindInstances "name~^$prefix-validator" show
[[ ${#instances[@]} -gt 0 ]] || { [[ ${#instances[@]} -gt 0 ]] || {
echo "Unable to start validators" echo "Unable to find validators"
exit 1 exit 1
} }
echo "validatorIpList=()" >> "$configFile" echo "validatorIpList=()" >> "$configFile"
gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
gcloud_ForEachInstance recordInstanceIp validatorIpList gcloud_ForEachInstance recordInstanceIp validatorIpList
gcloud_ForEachInstance waitForStartupComplete
echo "clientIpList=()" >> "$configFile" echo "clientIpList=()" >> "$configFile"
echo "Looking for client instances..." echo "Looking for client instances..."
gcloud_FindInstances "name~^$prefix-client" show gcloud_FindInstances "name~^$prefix-client" show
[[ ${#instances[@]} -eq 0 ]] || { [[ ${#instances[@]} -eq 0 ]] || {
gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
gcloud_ForEachInstance recordInstanceIp clientIpList gcloud_ForEachInstance recordInstanceIp clientIpList
gcloud_ForEachInstance waitForStartupComplete
} }
echo "Wrote $configFile" echo "Wrote $configFile"
@ -206,6 +243,9 @@ create)
$metricsWriteDatapoint "testnet-deploy net-create-begin=1" $metricsWriteDatapoint "testnet-deploy net-create-begin=1"
rm -rf "$sshPrivateKey"{,.pub}
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
printNetworkInfo() { printNetworkInfo() {
cat <<EOF cat <<EOF
======================================================================================== ========================================================================================
@ -233,15 +273,29 @@ cat > /etc/motd <<EOM
See "startup-script" log messages in /var/log/syslog for status: See "startup-script" log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | grep startup-script $ sudo cat /var/log/syslog | grep startup-script
To block until setup is complete, run:
$ until [[ -f /.gce-startup-complete ]]; do sleep 1; done
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
EOM EOM
# Place the generated private key at /solana-id_ecdsa so it's retrievable by anybody
# who is able to log into this machine
cat > /solana-id_ecdsa <<EOK
$(cat "$sshPrivateKey")
EOK
cat > /solana-id_ecdsa.pub <<EOK
$(cat "$sshPrivateKey.pub")
EOK
chmod 444 /solana-id_ecdsa
USER=\$(id -un) USER=\$(id -un)
$( $(
cd "$here"/scripts/ cd "$here"/scripts/
cat \ cat \
disable-background-upgrades.sh \ disable-background-upgrades.sh \
create-solana-user.sh \
install-earlyoom.sh \ install-earlyoom.sh \
install-rsync.sh \ install-rsync.sh \
install-libssl-compatability.sh \ install-libssl-compatability.sh \
@ -251,6 +305,8 @@ cat > /etc/motd <<EOM
$(printNetworkInfo) $(printNetworkInfo)
EOM EOM
touch /.gce-startup-complete
EOF EOF
gcloud_CreateInstances "$prefix-leader" 1 "$zone" \ gcloud_CreateInstances "$prefix-leader" 1 "$zone" \

View File

@ -116,16 +116,7 @@ build() {
startCommon() { startCommon() {
declare ipAddress=$1 declare ipAddress=$1
test -d "$SOLANA_ROOT" test -d "$SOLANA_ROOT"
ssh "${sshOptions[@]}" "$ipAddress" " ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
mkdir -p ~/solana ~/.cargo/bin
# Help other users of the machine locate network logs
[[ -d /tmp/solana/ ]] || {
mkdir /tmp/solana/
chmod go+w /tmp/solana/
}
ln -sfT ~/solana /tmp/solana/=
"
rsync -vPrc -e "ssh ${sshOptions[*]}" \ rsync -vPrc -e "ssh ${sshOptions[*]}" \
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \ "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
"$ipAddress":~/solana/ "$ipAddress":~/solana/
@ -231,7 +222,10 @@ start() {
" "
) )
else else
snap download --channel="$snapChannel" solana (
cd "$SOLANA_ROOT"
snap download --channel="$snapChannel" solana
)
fi fi
snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)" snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)"
[[ -r $snapFilename ]] || { [[ -r $snapFilename ]] || {

View File

@ -2,6 +2,8 @@
cd "$(dirname "$0")"/../.. cd "$(dirname "$0")"/../..
echo "$(date) | $0 $*" > client.log
deployMethod="$1" deployMethod="$1"
entrypointIp="$2" entrypointIp="$2"
numNodes="$3" numNodes="$3"

View File

@ -0,0 +1,27 @@
#!/bin/bash -ex
#
# Creates a passwordless `solana` user with unrestricted sudo access and
# installs the shared key pair found at /solana-id_ecdsa{,.pub} into that
# user's ~/.ssh directory.
#
# Preconditions checked below: runs on Linux, as root, and both key files are
# readable.  Any failed check exits with status 1.
#

# Only Linux hosts are supported
[[ $(uname) = Linux ]] || exit 1
# Must be root: the commands below create a user and edit /etc/sudoers
[[ $USER = root ]] || exit 1

# Create the account non-interactively with no password, then add it to the
# `sudo` group
adduser solana --gecos "" --disabled-password --quiet
adduser solana sudo
# Allow sudo without a password prompt.
# NOTE(review): this appends on every run, so re-running the script would add
# a duplicate line to /etc/sudoers -- confirm it is only ever run once.
echo "solana ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
# Show the resulting uid/gid/groups in the log
id solana

# The key pair must already be in place before it can be installed
[[ -r /solana-id_ecdsa ]] || exit 1
[[ -r /solana-id_ecdsa.pub ]] || exit 1

# Run as the solana user so that everything created under ~/.ssh is owned by
# it.  The public key becomes authorized_keys; `umask 377` is set only after
# that copy so the private key and the ssh config written afterwards are
# created owner-read-only.  The `~` in IdentityFile stays quoted throughout
# and is therefore written to the config file literally.
sudo -u solana bash -c "
mkdir -p /home/solana/.ssh/
cd /home/solana/.ssh/
cp /solana-id_ecdsa.pub authorized_keys
umask 377
cp /solana-id_ecdsa id_ecdsa
echo \"
Host *
BatchMode yes
IdentityFile ~/.ssh/id_ecdsa
StrictHostKeyChecking no
\" > config
"

View File

@ -1,5 +1,5 @@
#!/bin/bash -ex #!/bin/bash -ex
#
# Prevent background upgrades that block |apt-get| # Prevent background upgrades that block |apt-get|
# #
# TODO: This approach is pretty uncompromising. An alternative solution that # TODO: This approach is pretty uncompromising. An alternative solution that
@ -18,4 +18,3 @@ while fuser /var/lib/dpkg/lock; do
sleep 1 sleep 1
done done

View File

@ -185,149 +185,3 @@ gcloud_DeleteInstances() {
) )
} }
#
# gcloud_FigureRemoteUsername [instanceInfo]
#
# The remote username when ssh-ing into GCP instances tends to not be the same
# as the user's local username, but it needs to be discovered by ssh-ing into an
# instance and examining the system.
#
# On success the gcloud_username global variable is updated.  If
# gcloud_username is already non-empty the function returns immediately without
# contacting the instance.  If the username cannot be determined the shell
# exits with status 1.
#
# instanceInfo - an entry from the `instances` array
#
# example:
#   gcloud_FigureRemoteUsername "name:zone:..."
#
gcloud_FigureRemoteUsername() {
  if [[ -n $gcloud_username ]]; then
    return
  fi

  declare instanceInfo="$1"
  declare name zone publicIp
  IFS=: read -r name zone publicIp _ <<< "$instanceInfo"

  echo "Detecting remote username using $name in $zone:"

  # Scratch file that carries the remote output out of the subshell below.
  # mktemp avoids a predictable name in /tmp.
  declare whoamiFile
  whoamiFile=$(mktemp) || {
    echo "Unable to create temporary file"
    exit 1
  }

  # Figure the gcp ssh username
  (
    set -x

    # Try to ping the machine first.  There can be a delay between when the
    # instance is reported as RUNNING and when it's reachable over the network
    timeout 30s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"

    # Try to ssh in a couple times, sshd may not yet be up even though the
    # machine can be pinged...
    set -o pipefail
    for i in $(seq 1 10); do
      # The remote side prints "whoami:<user>:iamwho"; carriage returns and
      # spaces are stripped before the line is saved
      if gcloud compute ssh "$name" \
        --zone "$zone" -- "echo whoami:\$USER:iamwho" \
        | tr -d $'\r ' | tee "$whoamiFile"; then
        break
      fi
      sleep 1
      echo "Retry $i..."
    done
  )

  # Find the first line that carries both sentinels.  When no line matches,
  # the final failing `read` leaves gcloud_username empty.
  declare whoami iamwho
  while IFS=: read -r whoami gcloud_username iamwho; do
    [[ $whoami == "whoami" && $iamwho == "iamwho" ]] && break
  done < "$whoamiFile"
  rm -f "$whoamiFile"

  if [[ -z $gcloud_username ]]; then
    echo Unable to figure remote user name
    exit 1
  fi
  echo "Remote username: $gcloud_username"
}
#
# gcloud_PrepInstancesForSsh [username] [privateKey]
#
# Prepares all the instances in the `instances` array for ssh with the specified
# keypair. This eliminates the need to use the restrictive |gcloud compute ssh|,
# use plain |ssh| instead.
#
# For each instance the public key is appended to the remote
# .ssh/authorized_keys, a .ssh/config pointing at ~/.ssh/id_testnet is written,
# and the private key is copied to .ssh/id_testnet.  Per-instance output goes to
# log/gcloud_PrepInstancesForSsh-<name>.log (relative to the current
# directory).  If any key copy fails, its log is printed and the shell exits
# with status 1.
#
# username - gcp ssh username as computed by gcloud_FigureRemoteUsername
# privateKey - private key to install on all the instances; the matching public
#              key is expected alongside it at <privateKey>.pub
#
gcloud_PrepInstancesForSsh() {
declare username="$1"
declare privateKey="$2"
declare publicKey="$privateKey".pub
declare logDir=log/
# Start from a clean set of logs for this run
mkdir -p $logDir
rm -rf $logDir/gcloud_PrepInstancesForSsh-*
# Validate the inputs up front, before touching any instance
[[ -r $publicKey ]] || {
echo "Unable to read public key: $publicKey"
exit 1
}
[[ -r $privateKey ]] || {
echo "Unable to read private key: $privateKey"
exit 1
}
[[ -d $logDir ]] || {
echo "logDir does not exist: $logDir"
exit 1
}
# PIDs of the background key-copy jobs, reaped in the second loop below
declare -a pids
for instanceInfo in "${instances[@]}"; do
# Each `instances` entry is colon-delimited: name:zone:publicIp:...
declare name zone publicIp
IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo")
logFile="$logDir/gcloud_PrepInstancesForSsh-$name.log"
# TODO: This next subshell runs in series because for unknown reason running
# multiple |gcloud compute ssh| commands in parallel cause the macOS
# terminal to misbehave
#
# Step 1 (foreground): authorize the public key and write the remote ssh
# config.  $(cat "$publicKey") is expanded locally, so the key text itself is
# embedded in the command sent to the instance.
(
set -x
# Try to ping the machine first. There can be a delay between when the
# instance is reported as RUNNING and when it's reachable over the network
timeout 60s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"
gcloud compute ssh --zone "$zone" "$name" -- "
set -x;
mkdir -p .ssh;
echo \"$(cat "$publicKey")\" >> .ssh/authorized_keys;
echo \"
Host *
BatchMode yes
IdentityFile ~/.ssh/id_testnet
StrictHostKeyChecking no
\" > .ssh/config;
"
) >> "$logFile" 2>&1
# Step 2 (background): copy the private key over plain scp, authenticating
# with that same key, which step 1 just authorized
(
set -x
scp \
-o StrictHostKeyChecking=no \
-o UserKnownHostsFile=/dev/null \
-i "$privateKey" \
"$privateKey" "$username@$publicIp:.ssh/id_testnet"
) >> "$logFile" 2>&1 &
declare pid=$!
# Alias the log by pid so the wait loop below can find it from the pid alone.
# NOTE(review): -T looks like a GNU ln option -- confirm it is available on
# macOS hosts, which the TODO above suggests are in use.
ln -sfT "$logFile" "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
pids+=("$pid")
done
# Reap every background copy; the first failure dumps its log and aborts
for pid in "${pids[@]}"; do
declare ok=true
wait "$pid" || ok=false
if ! $ok; then
cat "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
echo ^^^ +++
exit 1
fi
done
}

View File

@ -46,7 +46,7 @@ fi
printNode() { printNode() {
declare nodeType=$1 declare nodeType=$1
declare ip=$2 declare ip=$2
printf " %-25s | For logs run: $0 $ip tail -f /tmp/solana/=/$nodeType.log\n" "$0 $ip" printf " %-25s | For logs run: $0 $ip tail -f solana/$nodeType.log\n" "$0 $ip"
} }
echo Leader: echo Leader: