diff --git a/ci/solana-testnet.yml b/ci/solana-testnet.yml deleted file mode 100755 index 296bbc4cec..0000000000 --- a/ci/solana-testnet.yml +++ /dev/null @@ -1,13 +0,0 @@ -steps: - - command: "ci/testnet-automation.sh" - label: "run testnet" - agents: - - "queue=testnet-deploy" - - - wait: ~ - continue_on_failure: true - - - command: "ci/testnet-automation-cleanup.sh" - label: "delete testnet" - agents: - - "queue=testnet-deploy" diff --git a/ci/testnet-automation-cleanup.sh b/ci/testnet-automation-cleanup.sh deleted file mode 100755 index ce69aa88a7..0000000000 --- a/ci/testnet-automation-cleanup.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -set -e - -cd "$(dirname "$0")/.." - -echo --- find testnet configuration -net/gce.sh config -p testnet-automation - -echo --- delete testnet -net/gce.sh delete -p testnet-automation diff --git a/ci/testnet-automation.sh b/ci/testnet-automation.sh deleted file mode 100755 index 814aaa1508..0000000000 --- a/ci/testnet-automation.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env bash -set -e - -cd "$(dirname "$0")/.." - -if [[ -z $USE_PREBUILT_CHANNEL_TARBALL ]]; then - echo --- downloading tar from build artifacts - buildkite-agent artifact download "solana-release*.tar.bz2" . -fi - -# shellcheck disable=SC1091 -source ci/upload-ci-artifact.sh - -[[ -n $ITERATION_WAIT ]] || ITERATION_WAIT=300 -[[ -n $NUMBER_OF_NODES ]] || NUMBER_OF_NODES="10 25 50 100" -[[ -n $LEADER_CPU_MACHINE_TYPE ]] || - LEADER_CPU_MACHINE_TYPE="--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" -[[ -n $CLIENT_COUNT ]] || CLIENT_COUNT=2 -[[ -n $TESTNET_TAG ]] || TESTNET_TAG=testnet-automation -[[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-b" -[[ -n $CHANNEL ]] || CHANNEL=beta -[[ -n $ADDITIONAL_FLAGS ]] || ADDITIONAL_FLAGS="" - -TESTNET_CLOUD_ZONES=(); while read -r -d, ; do TESTNET_CLOUD_ZONES+=( "$REPLY" ); done <<< "${TESTNET_ZONES}," - -launchTestnet() { - declare nodeCount=$1 - echo --- setup "$nodeCount" node test - - # shellcheck disable=SC2068 - net/gce.sh create \ - -d pd-ssd \ - -n "$nodeCount" -c "$CLIENT_COUNT" \ - -G "$LEADER_CPU_MACHINE_TYPE" \ - -p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/-z } "$ADDITIONAL_FLAGS" - - echo --- configure database - net/init-metrics.sh -e - - echo --- start "$nodeCount" node test - if [[ -n $USE_PREBUILT_CHANNEL_TARBALL ]]; then - net/net.sh start -o noValidatorSanity -t "$CHANNEL" - else - net/net.sh start -o noValidatorSanity -T solana-release*.tar.bz2 - fi - - echo --- wait "$ITERATION_WAIT" seconds to complete test - sleep "$ITERATION_WAIT" - - set -x - - declare q_mean_tps=' - SELECT round(mean("sum_count")) AS "mean_tps" FROM ( - SELECT sum("count") AS "sum_count" - FROM "testnet-automation"."autogen"."banking_stage-record_transactions" - WHERE time > now() - 300s GROUP BY time(1s) - )' - - declare q_max_tps=' - SELECT round(max("sum_count")) AS "max_tps" FROM ( - SELECT sum("count") AS "sum_count" - FROM "testnet-automation"."autogen"."banking_stage-record_transactions" - WHERE time > now() - 300s GROUP BY time(1s) - )' - - declare q_mean_confirmation=' - SELECT round(mean("duration_ms")) as "mean_confirmation" - FROM "testnet-automation"."autogen"."validator-confirmation" - WHERE time > now() - 300s' - - declare q_max_confirmation=' - SELECT round(max("duration_ms")) as "max_confirmation" - FROM "testnet-automation"."autogen"."validator-confirmation" - WHERE time > now() - 300s' - - declare q_99th_confirmation=' - SELECT round(percentile("duration_ms", 99)) as "99th_confirmation" - FROM "testnet-automation"."autogen"."validator-confirmation" - WHERE time > now() - 300s' - - curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \ - --data-urlencode "db=testnet-automation" \ - --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" | - python ci/testnet-automation-json-parser.py >>TPS"$nodeCount".log - - upload-ci-artifact TPS"$nodeCount".log -} - -# This is needed, because buildkite doesn't let us define an array of numbers. -# The array is defined as a space separated string of numbers -# shellcheck disable=SC2206 -nodes_count_array=($NUMBER_OF_NODES) - -for n in "${nodes_count_array[@]}"; do - launchTestnet "$n" -done diff --git a/system-test/testnet-performance/colo-gpu-perf.yml b/system-test/testnet-performance/colo-gpu-perf.yml new file mode 100755 index 0000000000..7e2f982fc7 --- /dev/null +++ b/system-test/testnet-performance/colo-gpu-perf.yml @@ -0,0 +1,14 @@ +steps: + - command: "system-test/testnet-performance/testnet-automation.sh" + label: "COLO performance testnet GPU enabled" + env: + CLOUD_PROVIDER: "colo" + TESTNET_TAG: "colo-edge-perf-gpu-enabled" + RAMP_UP_TIME: 60 + TEST_DURATION: 300 + NUMBER_OF_VALIDATOR_NODES: 4 + NUMBER_OF_CLIENT_NODES: 2 + CLIENT_OPTIONS: "bench-tps=2=--tx_count 80000 --thread-batch-sleep-ms 1000" + ADDITIONAL_FLAGS: "" + agents: + - "queue=colo-deploy" diff --git a/system-test/testnet-performance/gce-cpu-only-perf.yml b/system-test/testnet-performance/gce-cpu-only-perf.yml new file mode 100755 index 0000000000..1047712cd9 --- /dev/null +++ b/system-test/testnet-performance/gce-cpu-only-perf.yml @@ -0,0 +1,16 @@ +steps: + - command: "system-test/testnet-performance/testnet-automation.sh" + label: "GCE performance testnets CPU ONLY" + env: + CLOUD_PROVIDER: "gce" + TESTNET_TAG: "gce-edge-perf-cpu-only" + RAMP_UP_TIME: 60 + TEST_DURATION: 300 + NUMBER_OF_VALIDATOR_NODES: 10 + VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16" + NUMBER_OF_CLIENT_NODES: 1 + CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000" + TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" + ADDITIONAL_FLAGS: "" + agents: + - "queue=testnet-deploy" diff --git a/system-test/testnet-performance/gce-gpu-perf.yml b/system-test/testnet-performance/gce-gpu-perf.yml new file mode 100755 index 0000000000..755cd6ca7e --- /dev/null +++ b/system-test/testnet-performance/gce-gpu-perf.yml @@ -0,0 +1,16 @@ +steps: + - command: "system-test/testnet-performance/testnet-automation.sh" + label: "GCE performance testnets GPU ENABLED" + env: + CLOUD_PROVIDER: "gce" + TESTNET_TAG: "gce-edge-perf-gpu-enabled" + RAMP_UP_TIME: 60 + TEST_DURATION: 300 + NUMBER_OF_VALIDATOR_NODES: 10 + VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" + NUMBER_OF_CLIENT_NODES: 1 + CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000" + TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" + ADDITIONAL_FLAGS: "" + agents: + - "queue=testnet-deploy" diff --git a/ci/testnet-automation-json-parser.py b/system-test/testnet-performance/testnet-automation-json-parser.py similarity index 100% rename from ci/testnet-automation-json-parser.py rename to system-test/testnet-performance/testnet-automation-json-parser.py diff --git a/system-test/testnet-performance/testnet-automation.sh b/system-test/testnet-performance/testnet-automation.sh new file mode 100755 index 0000000000..f7e93a8414 --- /dev/null +++ b/system-test/testnet-performance/testnet-automation.sh @@ -0,0 +1,178 @@ +#!/usr/bin/env bash +set -e + +# TODO: Make sure a dB named $TESTNET_TAG exists in the influxDB host, or can be created +[[ -n $TESTNET_TAG ]] || TESTNET_TAG=testnet-automation +[[ -n $INFLUX_HOST ]] || INFLUX_HOST=https://metrics.solana.com:8086 + +# TODO: Remove all default values, force explicitness in the testcase definition +[[ -n $TEST_DURATION ]] || TEST_DURATION=300 +[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=60 +[[ -n $NUMBER_OF_VALIDATOR_NODES ]] || NUMBER_OF_VALIDATOR_NODES=2 +[[ -n $NUMBER_OF_CLIENT_NODES ]] || NUMBER_OF_CLIENT_NODES=1 +[[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-a" + +function collect_logs { + echo --- collect logs from remote nodes + rm -rf net/log + net/net.sh logs + for logfile in net/log/* ; do + ( + new_log=net/log/"$TESTNET_TAG"_"$NUMBER_OF_VALIDATOR_NODES"-nodes_"$(basename "$logfile")" + cp "$logfile" "$new_log" + upload-ci-artifact "$new_log" + ) + done +} + +function cleanup_testnet { + ( + set +e + collect_logs + ) + + ( + set +e + echo --- Stop Network Software + net/net.sh stop + ) + + case $CLOUD_PROVIDER in + gce) + ( + cat < now() - '"$TEST_DURATION"'s GROUP BY time(1s) + )' + + declare q_max_tps=' + SELECT round(max("sum_count")) AS "max_tps" FROM ( + SELECT sum("count") AS "sum_count" + FROM "'$TESTNET_TAG'"."autogen"."banking_stage-record_transactions" + WHERE time > now() - '"$TEST_DURATION"'s GROUP BY time(1s) + )' + + declare q_mean_confirmation=' + SELECT round(mean("duration_ms")) as "mean_confirmation" + FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation" + WHERE time > now() - '"$TEST_DURATION"'s' + + declare q_max_confirmation=' + SELECT round(max("duration_ms")) as "max_confirmation" + FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation" + WHERE time > now() - '"$TEST_DURATION"'s' + + declare q_99th_confirmation=' + SELECT round(percentile("duration_ms", 99)) as "99th_confirmation" + FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation" + WHERE time > now() - '"$TEST_DURATION"'s' + + RESULTS_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log + curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \ + --data-urlencode "db=${TESTNET_TAG}" \ + --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" | + python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULTS_FILE" + + upload-ci-artifact "$RESULTS_FILE" +} + +cd "$(dirname "$0")/../.." + +if [[ -z $SOLANA_METRICS_CONFIG ]]; then + if [[ -z $SOLANA_METRICS_PARTIAL_CONFIG ]]; then + echo SOLANA_METRICS_PARTIAL_CONFIG not defined + exit 1 + fi + export SOLANA_METRICS_CONFIG="db=$TESTNET_TAG,host=$INFLUX_HOST,$SOLANA_METRICS_PARTIAL_CONFIG" +fi +echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG" + +if [[ -z $CHANNEL ]]; then + echo --- downloading tar from build artifacts + buildkite-agent artifact download "solana-release*.tar.bz2" . +fi + +# shellcheck disable=SC1091 +source ci/upload-ci-artifact.sh + +maybeClientOptions=${CLIENT_OPTIONS:+"-c"} +maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"} + +IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}" + +launchTestnet