Refactor system-test dir structure and add stability test (#7530)
This commit is contained in:
388
system-test/testnet-automation.sh
Executable file
388
system-test/testnet-automation.sh
Executable file
@@ -0,0 +1,388 @@
|
||||
#!/usr/bin/env bash

# Abort on the first unhandled command failure; the EXIT trap registered
# further down in this file then runs cleanup_testnet.
set -e
|
||||
|
||||
#######################################
# Record the step that is about to run and announce it on stdout.
# Globals:   STEP (written) - read by cleanup_testnet when reporting a failure
# Arguments: $@ - words describing the step; joined with the first character
#                 of IFS (a space by default) into a single string
# Outputs:   "--- <step>" on stdout
#            (presumably rendered as a log section header by the CI - confirm)
#######################################
function execution_step {
  STEP="$*"
  echo "--- $STEP"
}
|
||||
|
||||
#######################################
# Fetch the node logs with `net/net.sh logs`, then upload each one as a CI
# artifact under a name prefixed with the testnet tag and validator count.
# Globals:   TESTNET_TAG, NUMBER_OF_VALIDATOR_NODES (read)
# Arguments: none
# Outputs:   files net/log/<tag>_<n>-nodes_<original name>; whatever
#            execution_step, net/net.sh and upload-ci-artifact print
#######################################
function collect_logs {
  local logfile new_log

  execution_step "Collect logs from remote nodes"
  rm -rf net/log
  net/net.sh logs

  for logfile in net/log/*; do
    # When nothing was fetched the glob stays literal ("net/log/*"); skip it
    # instead of trying to copy and upload a file that does not exist.
    [[ -e $logfile ]] || continue

    new_log="net/log/${TESTNET_TAG}_${NUMBER_OF_VALIDATOR_NODES}-nodes_${logfile##*/}"
    cp "$logfile" "$new_log"
    upload-ci-artifact "$new_log"
  done
}
|
||||
|
||||
#######################################
# Post-process the iftop logs on every validator and run the packet-loss
# analyzer over the collected results.
#
# Steps:
#   1. write ip_address_map.txt, one JSON object per validator mapping its
#      private IP to its public IP, and copy it to every validator
#   2. run iftop-postprocess.sh on every validator over ssh, store the output
#      in iftop-logs/<ip>-iftop.log and upload it as a CI artifact
#   3. run solana-log-analyzer over iftop-logs/ and sort its output
#
# Everything runs in a subshell, so `set -x` and the variables sourced from
# net/config/config do not leak into the caller.
# Globals:   validatorIpList, validatorIpListPrivate (read; provided by
#            net/config/config)
# Arguments: none
# Outputs:   analyzer output, sorted numerically on its second field, on stdout
#######################################
function analyze_packet_loss {
  (
    set -x
    # shellcheck disable=SC1091
    source net/config/config
    mkdir -p iftop-logs

    execution_step "Map private -> public IP addresses in iftop logs"
    # shellcheck disable=SC2154
    for i in "${!validatorIpList[@]}"; do
      # Every line, including the last, ends with a comma.
      printf '{"private": "%s", "public": "%s"},\n' \
        "${validatorIpListPrivate[$i]}" "${validatorIpList[$i]}"
    done > ip_address_map.txt

    for ip in "${validatorIpList[@]}"; do
      net/scp.sh ip_address_map.txt solana@"$ip":~/solana/
    done

    execution_step "Remotely post-process iftop logs"
    for ip in "${validatorIpList[@]}"; do
      iftop_log=iftop-logs/$ip-iftop.log
      # The command is single-quoted on purpose: $PATH and ~ must be expanded
      # on the remote node, not locally.
      # shellcheck disable=SC2016
      net/ssh.sh solana@"$ip" 'PATH=$PATH:~/.cargo/bin/ ~/solana/scripts/iftop-postprocess.sh ~/solana/iftop.log temp.log ~solana/solana/ip_address_map.txt' > "$iftop_log"
      upload-ci-artifact "$iftop_log"
    done

    execution_step "Analyzing Packet Loss"
    solana-release/bin/solana-log-analyzer analyze -f ./iftop-logs/ | sort -k 2 -g
  )
}
|
||||
|
||||
#######################################
# Cleanup handler, meant to be invoked from the script's EXIT trap.
#
# If the script is exiting with a non-zero status, RESULT_DETAILS is replaced
# by a failure report naming the current step and the failing command. It then
# optionally posts the results to Slack, collects logs, stops the network
# software, analyzes packet loss, and finally schedules deletion of the
# testnet by uploading a pipeline step via `buildkite-agent pipeline upload`.
#
# Globals:   STEP, CLOUD_PROVIDER, TESTNET_TAG, UPLOAD_RESULTS_TO_SLACK (read)
#            RC, RESULT_DETAILS, FINISH_UNIX_MSECS (written)
# Arguments: $@ - text of the command that was running when the script exited
#                 (the EXIT trap passes $BASH_COMMAND)
# Outputs:   step headers and helper output on stdout
#######################################
function cleanup_testnet {
  # Must be the first statement: $? still holds the script's exit status here.
  RC=$?
  if [[ $RC != 0 ]]; then
    # The command text is reported verbatim. Evaluating it (the previous
    # `eval echo`) would re-run any command substitution or redirection that
    # the failing command contained.
    RESULT_DETAILS="
Test failed during step:
${STEP}

Failure occurred when running the following command:
$*"
  fi

  # %N (nanoseconds) is a GNU date extension.
  FINISH_UNIX_MSECS="$(($(date +%s%N) / 1000000))"
  if [[ "$UPLOAD_RESULTS_TO_SLACK" = "true" ]]; then
    upload_results_to_slack
  fi

  # The next three steps are best-effort. `set +e` only protects the inside of
  # each subshell; the `|| true` additionally stops a non-zero subshell status
  # from aborting this handler (the script runs under `set -e`) before the
  # testnet deletion below has been scheduled.
  (
    set +e
    execution_step "Collecting Logfiles from Nodes"
    collect_logs
  ) || true

  (
    set +e
    execution_step "Stop Network Software"
    net/net.sh stop
  ) || true

  (
    set +e
    analyze_packet_loss
  ) || true

  execution_step "Deleting Testnet"
  local deploy_queue
  case $CLOUD_PROVIDER in
    gce | ec2 | azure)
      deploy_queue=testnet-deploy
      ;;
    colo)
      deploy_queue=colo-deploy
      ;;
    *)
      echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER"
      return
      ;;
  esac

  # The provider name doubles as the name of its net/<provider>.sh script.
  buildkite-agent pipeline upload <<EOF
- wait: ~
  continue_on_failure: true

- command: "net/${CLOUD_PROVIDER}.sh delete -p ${TESTNET_TAG}"
  label: "Delete Testnet"
  agents:
    - "queue=${deploy_queue}"
EOF
}
|
||||
# Run cleanup_testnet on every exit path. The trap string is single-quoted so
# that $BASH_COMMAND is expanded when the trap fires; it is double-quoted
# inside so the command text arrives as one argument, without word splitting
# or pathname expansion.
trap 'cleanup_testnet "$BASH_COMMAND"' EXIT
|
||||
|
||||
#######################################
# Create the nodes, start the network, let the test run, and collect summary
# statistics from the metrics database.
#
# Globals (read):
#   CLOUD_PROVIDER, NUMBER_OF_VALIDATOR_NODES, NUMBER_OF_CLIENT_NODES,
#   VALIDATOR_NODE_MACHINE_TYPE, TESTNET_TAG, TESTNET_CLOUD_ZONES (array),
#   ADDITIONAL_FLAGS, CHANNEL, CLIENT_OPTIONS, CLIENT_DELAY_START,
#   RAMP_UP_TIME, TEST_DURATION_SECONDS, INFLUX_HOST, RESULT_FILE,
#   startGpuMode, maybeCustomMachineType, maybeEnableGpu, maybeClientOptions,
#   maybeCreateAllowBootFailures, maybeStartAllowBootFailures,
#   maybePublicIpAddresses
# Globals (written):
#   RESULT_DETAILS - contents of RESULT_FILE after the run
#   STEP           - via execution_step
# Arguments: none
# Exits:     1 when CLOUD_PROVIDER is not one of gce/ec2/azure/colo, or when
#            it is gce and VALIDATOR_NODE_MACHINE_TYPE is empty
# Note:      `set -x` is enabled here and stays enabled after the function
#            returns.
#######################################
function launchTestnet() {
  set -x

  execution_step "Create ${NUMBER_OF_VALIDATOR_NODES} ${CLOUD_PROVIDER} nodes"

  # Optional arguments are collected in arrays so that an empty setting
  # contributes no argument at all rather than an empty-string argument.
  local -a node_count_args=() machine_type_args=() zone_args=()
  local -a additional_args=() cloud_args=() client_args=() release_args=()
  local zone

  node_count_args=(-n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES")

  if [[ -n $VALIDATOR_NODE_MACHINE_TYPE ]]; then
    machine_type_args=(
      ${maybeCustomMachineType:+"$maybeCustomMachineType"}
      "$VALIDATOR_NODE_MACHINE_TYPE"
    )
  fi

  for zone in "${TESTNET_CLOUD_ZONES[@]}"; do
    zone_args+=(-z "$zone")
  done

  # ADDITIONAL_FLAGS is a whitespace-separated list of extra flags; it is split
  # into words on purpose.
  # shellcheck disable=SC2206
  additional_args=(${ADDITIONAL_FLAGS[*]})

  # Arguments shared by the gce, ec2 and azure `create` commands.
  cloud_args=(
    "${node_count_args[@]}"
    "${machine_type_args[@]}"
    ${maybeEnableGpu:+"$maybeEnableGpu"}
    -p "$TESTNET_TAG"
    ${maybeCreateAllowBootFailures:+"$maybeCreateAllowBootFailures"}
    ${maybePublicIpAddresses:+"$maybePublicIpAddresses"}
    "${zone_args[@]}"
    "${additional_args[@]}"
  )

  case $CLOUD_PROVIDER in
    gce)
      if [[ -z $VALIDATOR_NODE_MACHINE_TYPE ]]; then
        echo VALIDATOR_NODE_MACHINE_TYPE not defined
        exit 1
      fi
      net/gce.sh create -d pd-ssd "${cloud_args[@]}"
      ;;
    ec2 | azure)
      "net/${CLOUD_PROVIDER}.sh" create "${cloud_args[@]}"
      ;;
    colo)
      net/colo.sh create \
        "${node_count_args[@]}" \
        ${maybeEnableGpu:+"$maybeEnableGpu"} \
        -p "$TESTNET_TAG" \
        ${maybePublicIpAddresses:+"$maybePublicIpAddresses"} \
        "${additional_args[@]}"
      ;;
    *)
      # Nothing was created, so there is nothing to start or measure.
      echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER"
      exit 1
      ;;
  esac

  execution_step "Configure database"
  net/init-metrics.sh -e

  execution_step "Fetch reusable testnet keypairs"
  if [[ ! -d net/keypairs ]]; then
    git clone git@github.com:solana-labs/testnet-keypairs.git net/keypairs
    # If we have provider-specific keys (CoLo*, GCE*, etc) use them instead of generic val*
    if [[ -d net/keypairs/"${CLOUD_PROVIDER}" ]]; then
      cp net/keypairs/"${CLOUD_PROVIDER}"/* net/keypairs/
    fi
  fi

  if [[ "$CLOUD_PROVIDER" = "colo" ]]; then
    execution_step "Stopping Colo nodes before we start"
    net/net.sh stop
  fi

  execution_step "Start ${NUMBER_OF_VALIDATOR_NODES} node test"
  if [[ -n $CHANNEL ]]; then
    release_args=(-t "$CHANNEL")
  else
    # Unquoted on purpose: the glob selects the downloaded release tarball.
    release_args=(-T solana-release*.tar.bz2)
  fi
  if [[ -n $CLIENT_OPTIONS ]]; then
    client_args=(${maybeClientOptions:+"$maybeClientOptions"} "$CLIENT_OPTIONS")
  fi
  net/net.sh start "${release_args[@]}" \
    "${client_args[@]}" \
    ${maybeStartAllowBootFailures:+"$maybeStartAllowBootFailures"} \
    --gpu-mode "$startGpuMode" --client-delay-start "$CLIENT_DELAY_START"

  execution_step "Wait ${RAMP_UP_TIME} seconds for network throughput to stabilize"
  sleep "$RAMP_UP_TIME"

  execution_step "Wait ${TEST_DURATION_SECONDS} seconds to complete test"
  sleep "$TEST_DURATION_SECONDS"

  execution_step "Collect statistics about run"
  # Every query is restricted to the last TEST_DURATION_SECONDS seconds.
  local q_mean_tps='
    SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
        SELECT SUM("count") AS "sum_count"
          FROM "'"$TESTNET_TAG"'"."autogen"."bank-process_transactions"
          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
          GROUP BY time(1s), host_id)
      GROUP BY time(1s)
    )'

  local q_max_tps='
    SELECT MAX("median_sum") as "max_tps" FROM (
      SELECT MEDIAN(sum_count) AS "median_sum" FROM (
        SELECT SUM("count") AS "sum_count"
          FROM "'"$TESTNET_TAG"'"."autogen"."bank-process_transactions"
          WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND count > 0
          GROUP BY time(1s), host_id)
      GROUP BY time(1s)
    )'

  local q_mean_confirmation='
    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
      FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

  local q_max_confirmation='
    SELECT round(max("duration_ms")) as "max_confirmation_ms"
      FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

  local q_99th_confirmation='
    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
      FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'

  # NOTE(review): the query credentials are hard-coded in the URL - confirm
  # they are meant to be a public read-only account.
  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
    --data-urlencode "db=${TESTNET_TAG}" \
    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" |
    python system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"

  execution_step "Writing test results to ${RESULT_FILE}"
  RESULT_DETAILS=$(<"$RESULT_FILE")
  upload-ci-artifact "$RESULT_FILE"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main flow: validate the environment, derive command-line options from it,
# then launch the testnet. cleanup_testnet runs afterwards via the EXIT trap.
#
# Required environment: TEST_DURATION_SECONDS, NUMBER_OF_VALIDATOR_NODES,
#   NUMBER_OF_CLIENT_NODES, and either SOLANA_METRICS_CONFIG or
#   SOLANA_METRICS_PARTIAL_CONFIG.
# Optional environment (default): TESTNET_TAG (testnet-automation),
#   INFLUX_HOST (https://metrics.solana.com:8086), RAMP_UP_TIME (0),
#   CLIENT_DELAY_START (0), ENABLE_GPU (false), ALLOW_BOOT_FAILURES (false),
#   USE_PUBLIC_IP_ADDRESSES (false), CHANNEL, CLIENT_OPTIONS, TESTNET_ZONES,
#   VALIDATOR_NODE_MACHINE_TYPE, ADDITIONAL_FLAGS.
# ---------------------------------------------------------------------------
RESULT_DETAILS=
STEP=
execution_step "Initialize Environment"

# Every relative path used below (net/, ci/, system-test/) is resolved from
# the parent of the directory containing this script.
cd "$(dirname "$0")/.."

# Defaults for settings that are unset or empty.
: "${TESTNET_TAG:=testnet-automation}"
: "${INFLUX_HOST:=https://metrics.solana.com:8086}"
: "${RAMP_UP_TIME:=0}"
: "${CLIENT_DELAY_START:=0}"
: "${ENABLE_GPU:=false}"
: "${ALLOW_BOOT_FAILURES:=false}"
: "${USE_PUBLIC_IP_ADDRESSES:=false}"

for required in TEST_DURATION_SECONDS NUMBER_OF_VALIDATOR_NODES NUMBER_OF_CLIENT_NODES; do
  if [[ -z ${!required} ]]; then
    echo "$required not defined"
    exit 1
  fi
done

# The maybe* flags are initialised explicitly so that values inherited from
# the environment cannot leak into the command lines.
startGpuMode="off"
maybeEnableGpu=
if [[ "$ENABLE_GPU" = "true" ]]; then
  maybeEnableGpu="--enable-gpu"
  startGpuMode="on"
fi

if [[ -z $SOLANA_METRICS_CONFIG ]]; then
  if [[ -z $SOLANA_METRICS_PARTIAL_CONFIG ]]; then
    echo SOLANA_METRICS_PARTIAL_CONFIG not defined
    exit 1
  fi
  export SOLANA_METRICS_CONFIG="db=$TESTNET_TAG,host=$INFLUX_HOST,$SOLANA_METRICS_PARTIAL_CONFIG"
fi
echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG"

maybeCreateAllowBootFailures=
maybeStartAllowBootFailures=
if [[ "$ALLOW_BOOT_FAILURES" = "true" ]]; then
  maybeCreateAllowBootFailures="--allow-boot-failures"
  maybeStartAllowBootFailures="-F"
fi

maybePublicIpAddresses=
if [[ "$USE_PUBLIC_IP_ADDRESSES" = "true" ]]; then
  maybePublicIpAddresses="-P"
fi

# Without a release channel the test runs the tarball built earlier in CI.
if [[ -z $CHANNEL ]]; then
  execution_step "Downloading tar from build artifacts"
  buildkite-agent artifact download "solana-release*.tar.bz2" .
fi

# shellcheck disable=SC1091
source ci/upload-ci-artifact.sh
# shellcheck disable=SC1091
source system-test/upload_results_to_slack.sh

# Each option flag is emitted only when the matching value is non-empty.
maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
maybeCustomMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"--custom-machine-type"}

# TESTNET_ZONES is a comma-separated list.
IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"

RESULT_FILE="${TESTNET_TAG}_SUMMARY_STATS_${NUMBER_OF_VALIDATOR_NODES}.log"
rm -f "$RESULT_FILE"

# Names of the settings summarised in TEST_CONFIGURATION
# (presumably reported by upload_results_to_slack - confirm).
TEST_PARAMS_TO_DISPLAY=(
  CLOUD_PROVIDER
  NUMBER_OF_VALIDATOR_NODES
  ENABLE_GPU
  VALIDATOR_NODE_MACHINE_TYPE
  NUMBER_OF_CLIENT_NODES
  CLIENT_OPTIONS
  CLIENT_DELAY_START
  TESTNET_ZONES
  TEST_DURATION_SECONDS
  USE_PUBLIC_IP_ADDRESSES
  ALLOW_BOOT_FAILURES
  ADDITIONAL_FLAGS
)

TEST_CONFIGURATION=
for param in "${TEST_PARAMS_TO_DISPLAY[@]}"; do
  # Settings with an empty value are left out of the summary.
  if [[ -n ${!param} ]]; then
    TEST_CONFIGURATION+="${param} = ${!param} | "
  fi
done

# %N (nanoseconds) is a GNU date extension.
START_UNIX_MSECS="$(($(date +%s%N) / 1000000))"

launchTestnet
|
Reference in New Issue
Block a user