Fixup scripts to set up a new CI node (#9348)

* Clean up node setup scripts for new CI boxes

* Move files under ci directory

* Set CUDA env var to setup cuda drivers

* Fixup and add README

* shellcheck

* Apply review feedback, rename dir and setup files

Co-authored-by: publish-docs.sh <maintainers@solana.com>
This commit is contained in:
Dan Albert
2020-04-20 17:43:13 -06:00
committed by GitHub
parent 41fec5bd5b
commit 3fbe7f0bb3
19 changed files with 266 additions and 147 deletions

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
#
# Disable and mask systemd-networkd-wait-online.service.
#
# Must be run via sudo by a non-root user; ensure_env (from utils.sh)
# enforces this and the script exits with status 1 otherwise.

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

# Trace every command and stop at the first failure.
set -xe

# "disable" first, then "mask", in that order.
for action in disable mask; do
  systemctl "$action" systemd-networkd-wait-online.service
done

View File

@@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# Blacklist the nouveau kernel module via a modprobe.d drop-in, then
# regenerate the initramfs.
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

# One modprobe directive per line.
printf '%s\n' \
  'blacklist nouveau' \
  'options nouveau modeset=0' \
  > /etc/modprobe.d/blacklist-nouveau.conf

update-initramfs -u

View File

@@ -0,0 +1,4 @@
#!/usr/bin/env bash
#
# Reload systemd's unit files, then enable the buildkite-agent unit and
# start it right away (--now).

agent_unit=buildkite-agent

sudo systemctl daemon-reload
sudo systemctl enable --now "$agent_unit"

View File

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
#
# Set this machine's hostname.
#
# Usage: sudo <this script> <hostname>
#
# Steps:
#   1. write a cloud-init drop-in containing "preserve_hostname: false"
#   2. restart cloud-init
#   3. apply the requested name with hostnamectl
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

# Validate the argument before touching anything: without this check a
# missing hostname would only be noticed by hostnamectl, after the
# cloud-init configuration had already been rewritten and restarted.
new_hostname="$1"
if [[ -z "$new_hostname" ]]; then
  echo "Usage: $0 <hostname>" >&2
  exit 1
fi

set -xe

echo "preserve_hostname: false" > /etc/cloud/cloud.cfg.d/99-disable-preserve-hostname.cfg
systemctl restart cloud-init
hostnamectl set-hostname "$new_hostname"

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env bash
#
# Install and configure buildkite-agent on a new CI node:
#   - add the Buildkite apt repository and install the agent package
#   - prompt for the agent token and write it into buildkite-agent.cfg
#   - install the agent's "environment" hook
#   - create an SSH key, set the shell and install Rust for the
#     buildkite-agent user
#   - add the user to the docker and sudo groups
#   - replace the systemd unit with one that sets LimitNOFILE
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

set -e

# Install buildkite-agent
echo "deb https://apt.buildkite.com/buildkite-agent stable main" | tee /etc/apt/sources.list.d/buildkite-agent.list
apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 32A37959C2FA5C3C99EFBC32A79206696452D198
apt-get update
apt-get install -y buildkite-agent

# Configure the installation
echo "Go to https://buildkite.com/organizations/solana-labs/agents"
echo "Click Reveal Agent Token"
echo "Paste the Agent Token, then press Enter:"
read -r agent_token
# An empty token would replace the "xxx" placeholder with nothing and leave
# the agent with an unusable configuration, so refuse it up front.
if [[ -z "$agent_token" ]]; then
  echo "No agent token provided" >&2
  exit 1
fi
sudo sed -i "s/xxx/$agent_token/g" /etc/buildkite-agent/buildkite-agent.cfg

# The delimiter is quoted ('EOF') so that $PATH and $BUILDKITE_BRANCH are
# written literally and evaluated when the hook runs for a job. With an
# unquoted delimiter they are expanded once, right here, by the installing
# root shell: PATH gets frozen to the installer's value and the branch test
# compares an empty string, so BUILDKITE_REFSPEC is never exported.
cat > /etc/buildkite-agent/hooks/environment <<'EOF'
set -e
export BUILDKITE_GIT_CLEAN_FLAGS="-ffdqx"
# Hack for non-docker rust builds
export PATH="$PATH":~buildkite-agent/.cargo/bin
# Add path to snaps
source /etc/profile.d/apps-bin-path.sh
if [[ $BUILDKITE_BRANCH =~ pull/* ]]; then
  export BUILDKITE_REFSPEC="+$BUILDKITE_BRANCH:refs/remotes/origin/$BUILDKITE_BRANCH"
fi
EOF
chown buildkite-agent:buildkite-agent /etc/buildkite-agent/hooks/environment

# Create SSH key
sudo -u buildkite-agent mkdir -p ~buildkite-agent/.ssh
sudo -u buildkite-agent ssh-keygen -t ecdsa -q -N "" -f ~buildkite-agent/.ssh/id_ecdsa

# Set buildkite-agent user's shell
sudo usermod --shell /bin/bash buildkite-agent

# Install Rust for buildkite-agent
# Download to an unpredictable temp file (this runs as root, so a fixed name
# in /tmp is unsafe). mktemp creates the file readable by root only, hence
# the chmod so the buildkite-agent user can read the installer.
rustup_init="$(mktemp /tmp/rustup-init.XXXXXX)"
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o "$rustup_init"
chmod 0644 "$rustup_init"
sudo -u buildkite-agent HOME=~buildkite-agent sh "$rustup_init" -y
rm -f "$rustup_init"

# Add to docker and sudoers group
addgroup buildkite-agent docker
addgroup buildkite-agent sudo

# Edit the systemd unit file to include LimitNOFILE
cat > /lib/systemd/system/buildkite-agent.service <<EOF
[Unit]
Description=Buildkite Agent
Documentation=https://buildkite.com/agent
After=syslog.target
After=network.target
[Service]
Type=simple
User=buildkite-agent
Environment=HOME=/var/lib/buildkite-agent
ExecStart=/usr/bin/buildkite-agent start
RestartSec=5
Restart=on-failure
RestartForceExitStatus=SIGPIPE
TimeoutStartSec=10
TimeoutStopSec=0
KillMode=process
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
DefaultInstance=1
EOF

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
#
# Download the NVIDIA CUDA runfile installer(s) listed in VERSIONS next to
# this script (unless already present), run each one to install the driver
# and toolkit, allow non-admin users to use the CUDA profiler, and enable
# NVIDIA persistence mode.
#
# Must be run via sudo by a non-root user (checked by ensure_env).

# https://developer.nvidia.com/cuda-toolkit-archive
VERSIONS=()
#VERSIONS+=("https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux")
#VERSIONS+=("https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.168_418.67_linux.run")
VERSIONS+=("http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run")

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

set -xe

RUN_FILES=()   # installers available locally and ready to run
FAILED=()      # URLs that could not be downloaded

for URL in "${VERSIONS[@]}"; do
  RUN_FILE="$(basename "$URL")"
  DEST="${HERE}/${RUN_FILE}"

  # Reuse an installer that is already present.
  if [[ -f "$DEST" ]]; then
    RUN_FILES+=( "$DEST" )
    continue
  fi

  # Download under a temporary name and rename only on success. "wget -O"
  # creates its output file even when the transfer fails, and a leftover
  # partial file at $DEST would be mistaken for a complete installer by the
  # existence check above on the next run.
  PARTIAL="${DEST}.part"
  echo -ne "Downloading ${RUN_FILE}:\t"
  if wget --read-timeout=180 --tries=3 -O "$PARTIAL" "$URL" && mv -f "$PARTIAL" "$DEST"; then
    echo "OK"
    RUN_FILES+=( "$DEST" )
  else
    rm -f "$PARTIAL"
    # wget has already used up its --tries; nothing is retried from here.
    echo "FAILED"
    FAILED+=( "$URL" )
  fi
done

if [[ 0 -ne ${#FAILED[@]} ]]; then
  for f in "${FAILED[@]}"; do
    echo "Failed to download required resource: $f"
  done
  echo "Please manually download the above resources, save them to \"${HERE}\" and rerun $0"
  exit 1
fi

apt update
apt install -y gcc make dkms

for rf in "${RUN_FILES[@]}"; do
  sh "$rf" --silent --driver --toolkit
done

# Allow normal users to use CUDA profiler
echo 'options nvidia "NVreg_RestrictProfilingToAdminUsers=0"' > /etc/modprobe.d/nvidia-enable-user-profiling.conf

# setup persistence mode across reboots
# A dedicated variable is used for the scratch directory: assigning to
# TMPDIR would change where later commands create their temporary files and
# leave it pointing at a directory that is removed below.
WORK_DIR="$(mktemp -d)"
if pushd "$WORK_DIR"; then
  tar -xvf /usr/share/doc/NVIDIA_GLX-1.0/samples/nvidia-persistenced-init.tar.bz2
  ./nvidia-persistenced-init/install.sh systemd
  popd
  rm -rf "$WORK_DIR"
fi

nvidia-smi -pm ENABLED

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env bash
#
# Raise the soft open-file limit for all users by adding
# "* soft nofile 65535" to /etc/security/limits.conf.
#
# Safe to run repeatedly: the entry is added only when it is not already
# present.
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

limits_conf=/etc/security/limits.conf
nofile_entry='* soft nofile 65535'

# Allow more files to be opened by a user
if grep -qxF -e "$nofile_entry" "$limits_conf"; then
  # Already configured by an earlier run; adding it again would only pile up
  # duplicate lines.
  :
elif grep -q '^# End of file' "$limits_conf"; then
  # Insert the entry just above the "# End of file" marker.
  sed -i 's/^\(# End of file\)/* soft nofile 65535\n\n\1/' "$limits_conf"
else
  # No marker to anchor on: append instead of silently doing nothing.
  printf '%s\n' "$nofile_entry" >> "$limits_conf"
fi

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
#
# Top-level provisioning script for a new machine: upgrades packages,
# installs the build tooling, then runs the individual setup/install
# scripts from this directory and from net/scripts.
#
# Environment:
#   CUDA  when non-empty, setup-cuda.sh is run as the final step
#
# Must be run via sudo by a non-root user (checked by ensure_env, which also
# exports SETUP_USER used below).

HERE="$(dirname "$0")"
SOLANA_ROOT="$HERE"/../..

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

set -ex

apt update
apt upgrade -y

# The delimiter is quoted so the hook command is written out literally.
# Inside the hook, stdout is redirected before "2>&1" so that both streams of
# "which" are discarded; in the opposite order stderr would still be printed.
cat >/etc/apt/apt.conf.d/99-solana <<'EOF'
// Set and persist extra caps on iftop binary
Dpkg::Post-Invoke { "which iftop >/dev/null 2>&1 && setcap cap_net_raw=eip $(which iftop) || true"; };
EOF

apt install -y build-essential pkg-config clang cmake sysstat linux-tools-common \
  linux-generic-hwe-18.04-edge linux-tools-generic-hwe-18.04-edge \
  iftop heaptrack jq ruby python3-venv gcc-multilib libudev-dev

gem install ejson ejson2env
mkdir -p /opt/ejson/keys

"$SOLANA_ROOT"/net/scripts/install-docker.sh
usermod -aG docker "$SETUP_USER"
"$SOLANA_ROOT"/net/scripts/install-certbot.sh
"$HERE"/setup-sudoers.sh
"$HERE"/setup-ssh.sh
"$HERE"/disable-nouveau.sh
"$HERE"/disable-networkd-wait.sh
"$SOLANA_ROOT"/net/scripts/install-earlyoom.sh
"$SOLANA_ROOT"/net/scripts/install-nodejs.sh
"$SOLANA_ROOT"/net/scripts/localtime.sh
"$SOLANA_ROOT"/net/scripts/install-redis.sh
"$SOLANA_ROOT"/net/scripts/install-rsync.sh
"$SOLANA_ROOT"/net/scripts/install-libssl-compatability.sh
"$HERE"/setup-procfs-knobs.sh
"$HERE"/setup-limits.sh

# CUDA setup is opt-in. An explicit "if" keeps an empty CUDA from looking
# like a failed command while a failing setup-cuda.sh still aborts (set -e).
if [[ -n $CUDA ]]; then
  "$HERE"/setup-cuda.sh
fi

exit 0

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
#
# Provision a partner node: run the listed setup scripts from this
# directory in order, then generate a random password, print it, and assign
# it to the "solana" account.
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

set -ex

# Order matters; with "set -e" the first failing step aborts the run.
for step in \
  disable-nouveau.sh \
  disable-networkd-wait.sh \
  setup-grub.sh \
  setup-cuda.sh \
  setup-procfs-knobs.sh \
  setup-limits.sh
do
  "$HERE"/"$step"
done

# 9 random bytes, base64-encoded.
generated_password="$(dd if=/dev/urandom bs=1 count=9 status=none | base64)"

# Print the password so the operator can record it, then apply it.
echo "$generated_password"
chpasswd <<< "solana:$generated_password"

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
#
# Write /etc/sysctl.d/20-solana-node.conf with the socket-buffer, sysrq and
# perf-event settings listed below.
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

# Keep whatever sysrq bits are currently enabled and additionally set bit 64.
sysrq_now="$(cat /proc/sys/kernel/sysrq)"
sysrq_wanted=$(( sysrq_now | 64 ))

# Setup kernel constants
cat > /etc/sysctl.d/20-solana-node.conf <<EOF
# Solana networking requirements
net.core.rmem_default=134217728
net.core.rmem_max=134217728
net.core.wmem_default=134217728
net.core.wmem_max=134217728
# Solana earlyoom setup
kernel.sysrq=${sysrq_wanted}
# Allow kernel and CPU perf events
kernel.perf_event_paranoid=0
EOF

View File

@@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# Harden sshd: turn off password authentication and root login in
# /etc/ssh/sshd_config, then restart sshd.
#
# Must be run via sudo by a non-root user (checked by ensure_env).

HERE="$(dirname "$0")"

# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh

ensure_env || exit 1

set -xe

# Setup sshd
# Rewrite each directive to "no" whether the existing line is active or
# commented out ("#Keyword ..."), and whatever value it currently carries.
# Merely blanking an active "PasswordAuthentication yes" line is not enough
# when no commented template line exists to be switched to "no", and an
# active PermitRootLogin line must be overridden too.
# The keyword has to start the line (after an optional "#") and be followed
# by whitespace, so explanatory comments that merely mention a keyword
# ("# PasswordAuthentication.  Depending on ...") are left alone.
sed -i -E \
  -e 's/^#?(PasswordAuthentication)[[:space:]].*/\1 no/' \
  -e 's/^#?(PermitRootLogin)[[:space:]].*/\1 no/' \
  /etc/ssh/sshd_config

systemctl restart sshd

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Replace the sudoers file with the content of the heredoc below, which
# grants passwordless sudo to members of the sudo group.
#
# Must be run via sudo by a non-root user (checked by ensure_env).
HERE="$(dirname "$0")"
# shellcheck source=ci/setup-new-buildkite-agent/utils.sh
source "$HERE"/utils.sh
ensure_env || exit 1
# Trace commands and abort on the first failure.
set -xe
# Enable passwordless sudo
# visudo is started with "tee" as its editor, so the heredoc on stdin is
# what gets written to the file being edited.
# NOTE(review): this relies on visudo honouring the EDITOR variable and
# passing the file to edit as the editor's argument - see visudo(8).
# Everything between the EOF markers is sudoers content, not shell: lines
# starting with "#" in there are part of the payload.
EDITOR='tee' visudo <<EOF
#
# This file MUST be edited with the 'visudo' command as root.
#
# Please consider adding local content in /etc/sudoers.d/ instead of
# directly modifying this file.
#
# See the man page for details on how to write a sudoers file.
#
Defaults env_reset
Defaults mail_badpass
Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin"
# Host alias specification
# User alias specification
# Cmnd alias specification
# User privilege specification
root ALL=(ALL:ALL) ALL
# Members of the admin group may gain root privileges
%admin ALL=(ALL) ALL
# Allow members of group sudo to execute any command
%sudo ALL=(ALL:ALL) ALL
# Allow all members of sudo group to use passwordless sudo
%sudo ALL=(ALL) NOPASSWD:ALL
# See sudoers(5) for more information on "#include" directives:
#includedir /etc/sudoers.d
EOF

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#######################################
# Verify that the calling script runs as root through sudo, started by a
# non-root user. Root privileges are needed, but so is the identity of the
# invoking user.
# Globals:
#   EUID, SUDO_USER, HOME (read)
#   RC (written: "true" or "false")
#   SETUP_USER, SETUP_HOME (exported on success)
# Outputs:
#   On failure, a hint on how to invoke the script, written to stdout.
# Returns:
#   0 when the environment is acceptable, 1 otherwise.
#######################################
ensure_env() {
  RC=false
  if [[ $EUID -eq 0 && -n $SUDO_USER && $SUDO_USER != root ]]; then
    RC=true
    SETUP_USER="$SUDO_USER"
    SETUP_HOME="$HOME"
    export SETUP_USER SETUP_HOME
  else
    echo "Please run \"$0\" via sudo as a normal user"
  fi
  [[ $RC == true ]]
}