Implement nightly performance tests (#6140)
* Implement nightly performance tests on colo
This commit is contained in:
		| @@ -1,13 +0,0 @@ | |||||||
| steps: |  | ||||||
|   - command: "ci/testnet-automation.sh" |  | ||||||
|     label: "run testnet" |  | ||||||
|     agents: |  | ||||||
|       - "queue=testnet-deploy" |  | ||||||
|  |  | ||||||
|   - wait: ~ |  | ||||||
|     continue_on_failure: true |  | ||||||
|  |  | ||||||
|   - command: "ci/testnet-automation-cleanup.sh" |  | ||||||
|     label: "delete testnet" |  | ||||||
|     agents: |  | ||||||
|       - "queue=testnet-deploy" |  | ||||||
| @@ -1,10 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
| set -e |  | ||||||
|  |  | ||||||
| cd "$(dirname "$0")/.." |  | ||||||
|  |  | ||||||
| echo --- find testnet configuration |  | ||||||
| net/gce.sh config -p testnet-automation |  | ||||||
|  |  | ||||||
| echo --- delete testnet |  | ||||||
| net/gce.sh delete -p testnet-automation |  | ||||||
| @@ -1,96 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
| set -e |  | ||||||
|  |  | ||||||
| cd "$(dirname "$0")/.." |  | ||||||
|  |  | ||||||
| if [[ -z $USE_PREBUILT_CHANNEL_TARBALL ]]; then |  | ||||||
|   echo --- downloading tar from build artifacts |  | ||||||
|   buildkite-agent artifact download "solana-release*.tar.bz2" . |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # shellcheck disable=SC1091 |  | ||||||
| source ci/upload-ci-artifact.sh |  | ||||||
|  |  | ||||||
| [[ -n $ITERATION_WAIT ]] || ITERATION_WAIT=300 |  | ||||||
| [[ -n $NUMBER_OF_NODES ]] || NUMBER_OF_NODES="10 25 50 100" |  | ||||||
| [[ -n $LEADER_CPU_MACHINE_TYPE ]] || |  | ||||||
|   LEADER_CPU_MACHINE_TYPE="--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" |  | ||||||
| [[ -n $CLIENT_COUNT ]] || CLIENT_COUNT=2 |  | ||||||
| [[ -n $TESTNET_TAG ]] || TESTNET_TAG=testnet-automation |  | ||||||
| [[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-b" |  | ||||||
| [[ -n $CHANNEL ]] || CHANNEL=beta |  | ||||||
| [[ -n $ADDITIONAL_FLAGS ]] || ADDITIONAL_FLAGS="" |  | ||||||
|  |  | ||||||
| TESTNET_CLOUD_ZONES=(); while read -r -d, ; do TESTNET_CLOUD_ZONES+=( "$REPLY" ); done <<< "${TESTNET_ZONES}," |  | ||||||
|  |  | ||||||
| launchTestnet() { |  | ||||||
|   declare nodeCount=$1 |  | ||||||
|   echo --- setup "$nodeCount" node test |  | ||||||
|  |  | ||||||
|   # shellcheck disable=SC2068 |  | ||||||
|   net/gce.sh create \ |  | ||||||
|     -d pd-ssd \ |  | ||||||
|     -n "$nodeCount" -c "$CLIENT_COUNT" \ |  | ||||||
|     -G "$LEADER_CPU_MACHINE_TYPE" \ |  | ||||||
|     -p "$TESTNET_TAG" ${TESTNET_CLOUD_ZONES[@]/#/-z } "$ADDITIONAL_FLAGS" |  | ||||||
|  |  | ||||||
|   echo --- configure database |  | ||||||
|   net/init-metrics.sh -e |  | ||||||
|  |  | ||||||
|   echo --- start "$nodeCount" node test |  | ||||||
|   if [[ -n $USE_PREBUILT_CHANNEL_TARBALL ]]; then |  | ||||||
|     net/net.sh start -o noValidatorSanity -t "$CHANNEL" |  | ||||||
|   else |  | ||||||
|     net/net.sh start -o noValidatorSanity -T solana-release*.tar.bz2 |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   echo --- wait "$ITERATION_WAIT" seconds to complete test |  | ||||||
|   sleep "$ITERATION_WAIT" |  | ||||||
|  |  | ||||||
|   set -x |  | ||||||
|  |  | ||||||
|   declare q_mean_tps=' |  | ||||||
|     SELECT round(mean("sum_count")) AS "mean_tps" FROM ( |  | ||||||
|       SELECT sum("count") AS "sum_count" |  | ||||||
|         FROM "testnet-automation"."autogen"."banking_stage-record_transactions" |  | ||||||
|         WHERE time > now() - 300s GROUP BY time(1s) |  | ||||||
|     )' |  | ||||||
|  |  | ||||||
|   declare q_max_tps=' |  | ||||||
|     SELECT round(max("sum_count")) AS "max_tps" FROM ( |  | ||||||
|       SELECT sum("count") AS "sum_count" |  | ||||||
|         FROM "testnet-automation"."autogen"."banking_stage-record_transactions" |  | ||||||
|         WHERE time > now() - 300s GROUP BY time(1s) |  | ||||||
|     )' |  | ||||||
|  |  | ||||||
|   declare q_mean_confirmation=' |  | ||||||
|     SELECT round(mean("duration_ms")) as "mean_confirmation" |  | ||||||
|       FROM "testnet-automation"."autogen"."validator-confirmation" |  | ||||||
|       WHERE time > now() - 300s' |  | ||||||
|  |  | ||||||
|   declare q_max_confirmation=' |  | ||||||
|     SELECT round(max("duration_ms")) as "max_confirmation" |  | ||||||
|       FROM "testnet-automation"."autogen"."validator-confirmation" |  | ||||||
|       WHERE time > now() - 300s' |  | ||||||
|  |  | ||||||
|   declare q_99th_confirmation=' |  | ||||||
|     SELECT round(percentile("duration_ms", 99)) as "99th_confirmation" |  | ||||||
|       FROM "testnet-automation"."autogen"."validator-confirmation" |  | ||||||
|       WHERE time > now() - 300s' |  | ||||||
|  |  | ||||||
|   curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \ |  | ||||||
|     --data-urlencode "db=testnet-automation" \ |  | ||||||
|     --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" | |  | ||||||
|     python ci/testnet-automation-json-parser.py >>TPS"$nodeCount".log |  | ||||||
|  |  | ||||||
|   upload-ci-artifact TPS"$nodeCount".log |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # This is needed, because buildkite doesn't let us define an array of numbers. |  | ||||||
| # The array is defined as a space separated string of numbers |  | ||||||
| # shellcheck disable=SC2206 |  | ||||||
| nodes_count_array=($NUMBER_OF_NODES) |  | ||||||
|  |  | ||||||
| for n in "${nodes_count_array[@]}"; do |  | ||||||
|   launchTestnet "$n" |  | ||||||
| done |  | ||||||
							
								
								
									
										14
									
								
								system-test/testnet-performance/colo-gpu-perf.yml
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										14
									
								
								system-test/testnet-performance/colo-gpu-perf.yml
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
|  | # Buildkite pipeline: runs the performance testnet automation script against | ||||||
|  | # the "colo" provider with 4 validator nodes and 2 client nodes, on an agent | ||||||
|  | # from the colo-deploy queue. | ||||||
|  | steps: | ||||||
|  |   - command: "system-test/testnet-performance/testnet-automation.sh" | ||||||
|  |     label: "COLO performance testnet GPU enabled" | ||||||
|  |     env: | ||||||
|  |       CLOUD_PROVIDER: "colo" | ||||||
|  |       TESTNET_TAG: "colo-edge-perf-gpu-enabled" | ||||||
|  |       # Seconds the script sleeps before, and during, the measured window. | ||||||
|  |       RAMP_UP_TIME: 60 | ||||||
|  |       TEST_DURATION: 300 | ||||||
|  |       NUMBER_OF_VALIDATOR_NODES: 4 | ||||||
|  |       NUMBER_OF_CLIENT_NODES: 2 | ||||||
|  |       # Forwarded by the script to "net/net.sh start" as the value of -c. | ||||||
|  |       CLIENT_OPTIONS: "bench-tps=2=--tx_count 80000 --thread-batch-sleep-ms 1000" | ||||||
|  |       ADDITIONAL_FLAGS: "" | ||||||
|  |     agents: | ||||||
|  |       - "queue=colo-deploy" | ||||||
							
								
								
									
										16
									
								
								system-test/testnet-performance/gce-cpu-only-perf.yml
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										16
									
								
								system-test/testnet-performance/gce-cpu-only-perf.yml
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,16 @@ | |||||||
|  | # Buildkite pipeline: runs the performance testnet automation script against | ||||||
|  | # the "gce" provider with 10 CPU-only validator nodes and 1 client node, on an | ||||||
|  | # agent from the testnet-deploy queue. | ||||||
|  | steps: | ||||||
|  |   - command: "system-test/testnet-performance/testnet-automation.sh" | ||||||
|  |     label: "GCE performance testnets CPU ONLY" | ||||||
|  |     env: | ||||||
|  |       CLOUD_PROVIDER: "gce" | ||||||
|  |       TESTNET_TAG: "gce-edge-perf-cpu-only" | ||||||
|  |       # Seconds the script sleeps before, and during, the measured window. | ||||||
|  |       RAMP_UP_TIME: 60 | ||||||
|  |       TEST_DURATION: 300 | ||||||
|  |       NUMBER_OF_VALIDATOR_NODES: 10 | ||||||
|  |       # Forwarded by the script to "net/gce.sh create" as the value of -G. | ||||||
|  |       VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16" | ||||||
|  |       NUMBER_OF_CLIENT_NODES: 1 | ||||||
|  |       # Forwarded by the script to "net/net.sh start" as the value of -c. | ||||||
|  |       CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000" | ||||||
|  |       # Comma-separated; the script passes each zone as its own -z argument. | ||||||
|  |       TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" | ||||||
|  |       ADDITIONAL_FLAGS: "" | ||||||
|  |     agents: | ||||||
|  |       - "queue=testnet-deploy" | ||||||
							
								
								
									
										16
									
								
								system-test/testnet-performance/gce-gpu-perf.yml
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										16
									
								
								system-test/testnet-performance/gce-gpu-perf.yml
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,16 @@ | |||||||
|  | # Buildkite pipeline: runs the performance testnet automation script against | ||||||
|  | # the "gce" provider with 10 GPU-equipped validator nodes and 1 client node, | ||||||
|  | # on an agent from the testnet-deploy queue. | ||||||
|  | steps: | ||||||
|  |   - command: "system-test/testnet-performance/testnet-automation.sh" | ||||||
|  |     label: "GCE performance testnets GPU ENABLED" | ||||||
|  |     env: | ||||||
|  |       CLOUD_PROVIDER: "gce" | ||||||
|  |       TESTNET_TAG: "gce-edge-perf-gpu-enabled" | ||||||
|  |       # Seconds the script sleeps before, and during, the measured window. | ||||||
|  |       RAMP_UP_TIME: 60 | ||||||
|  |       TEST_DURATION: 300 | ||||||
|  |       NUMBER_OF_VALIDATOR_NODES: 10 | ||||||
|  |       # Forwarded by the script to "net/gce.sh create" as the value of -G. | ||||||
|  |       VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100" | ||||||
|  |       NUMBER_OF_CLIENT_NODES: 1 | ||||||
|  |       # Forwarded by the script to "net/net.sh start" as the value of -c. | ||||||
|  |       CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000" | ||||||
|  |       # Comma-separated; the script passes each zone as its own -z argument. | ||||||
|  |       TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a" | ||||||
|  |       ADDITIONAL_FLAGS: "" | ||||||
|  |     agents: | ||||||
|  |       - "queue=testnet-deploy" | ||||||
							
								
								
									
										178
									
								
								system-test/testnet-performance/testnet-automation.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										178
									
								
								system-test/testnet-performance/testnet-automation.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,178 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | # | ||||||
|  | # Creates a testnet on the provider named by CLOUD_PROVIDER, runs it for | ||||||
|  | # RAMP_UP_TIME + TEST_DURATION seconds, then queries InfluxDB for throughput | ||||||
|  | # and confirmation-time statistics and uploads them as a CI artifact. | ||||||
|  | # All settings are read from the environment; unset or empty ones fall back | ||||||
|  | # to the defaults below. | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | # TESTNET_TAG doubles as the InfluxDB database name used for the run. | ||||||
|  | # TODO: Make sure a database named $TESTNET_TAG exists in the InfluxDB host, or can be created | ||||||
|  | [[ -n $TESTNET_TAG ]] || TESTNET_TAG=testnet-automation | ||||||
|  | [[ -n $INFLUX_HOST ]] || INFLUX_HOST=https://metrics.solana.com:8086 | ||||||
|  |  | ||||||
|  | # TODO: Remove all default values, force explicitness in the testcase definition | ||||||
|  | # Durations are in seconds. | ||||||
|  | [[ -n $TEST_DURATION ]] || TEST_DURATION=300 | ||||||
|  | [[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=60 | ||||||
|  | [[ -n $NUMBER_OF_VALIDATOR_NODES ]] || NUMBER_OF_VALIDATOR_NODES=2 | ||||||
|  | [[ -n $NUMBER_OF_CLIENT_NODES ]] || NUMBER_OF_CLIENT_NODES=1 | ||||||
|  | # Comma-separated list of zones. | ||||||
|  | [[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-a" | ||||||
|  |  | ||||||
|  | # Fetch the logs from the remote nodes into net/log and upload each one as a | ||||||
|  | # CI artifact under a name prefixed with the testnet tag and validator count. | ||||||
|  | # | ||||||
|  | # Globals:  TESTNET_TAG, NUMBER_OF_VALIDATOR_NODES (read) | ||||||
|  | # Requires: upload-ci-artifact to be defined by the time this is called. | ||||||
|  | function collect_logs { | ||||||
|  |   echo --- collect logs from remote nodes | ||||||
|  |   rm -rf net/log | ||||||
|  |   net/net.sh logs | ||||||
|  |   for logfile in net/log/* ; do | ||||||
|  |     # When nothing was fetched the glob stays as the literal pattern; skip it | ||||||
|  |     # rather than trying to copy and upload a file that does not exist. | ||||||
|  |     [[ -e $logfile ]] || continue | ||||||
|  |     ( | ||||||
|  |       new_log=net/log/"$TESTNET_TAG"_"$NUMBER_OF_VALIDATOR_NODES"-nodes_"$(basename "$logfile")" | ||||||
|  |       cp "$logfile" "$new_log" | ||||||
|  |       upload-ci-artifact "$new_log" | ||||||
|  |     ) | ||||||
|  |   done | ||||||
|  | } | ||||||
|  |  | ||||||
|  | # EXIT handler: collect logs, stop the network software, then upload a | ||||||
|  | # follow-up pipeline step that deletes the testnet for the current provider. | ||||||
|  | # | ||||||
|  | # Globals: CLOUD_PROVIDER, TESTNET_TAG (read) | ||||||
|  | function cleanup_testnet { | ||||||
|  |   # Log collection and shutdown are best effort. "set +e" only relaxes errexit | ||||||
|  |   # inside each subshell; the trailing "|| true" stops a non-zero subshell | ||||||
|  |   # status from tripping the script-level "set -e", which would abort this | ||||||
|  |   # handler before the delete step below is queued. | ||||||
|  |   ( | ||||||
|  |     set +e | ||||||
|  |     collect_logs | ||||||
|  |   ) || true | ||||||
|  |  | ||||||
|  |   ( | ||||||
|  |     set +e | ||||||
|  |     echo --- Stop Network Software | ||||||
|  |     net/net.sh stop | ||||||
|  |   ) || true | ||||||
|  |  | ||||||
|  |   # The delete step differs between providers only in the script it runs and | ||||||
|  |   # the agent queue it runs on. | ||||||
|  |   declare delete_script delete_queue | ||||||
|  |   case $CLOUD_PROVIDER in | ||||||
|  |   gce) | ||||||
|  |     delete_script=net/gce.sh | ||||||
|  |     delete_queue=testnet-deploy | ||||||
|  |     ;; | ||||||
|  |   colo) | ||||||
|  |     delete_script=net/colo.sh | ||||||
|  |     delete_queue=colo-deploy | ||||||
|  |     ;; | ||||||
|  |   *) | ||||||
|  |     echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER" | ||||||
|  |     return | ||||||
|  |     ;; | ||||||
|  |   esac | ||||||
|  |  | ||||||
|  |   # "continue_on_failure" lets the delete step run even when this job failed. | ||||||
|  |   buildkite-agent pipeline upload <<EOF | ||||||
|  | - wait: ~ | ||||||
|  |   continue_on_failure: true | ||||||
|  |  | ||||||
|  | - command: "${delete_script} delete -p ${TESTNET_TAG}" | ||||||
|  |   label: "Delete Testnet" | ||||||
|  |   agents: | ||||||
|  |     - "queue=${delete_queue}" | ||||||
|  | EOF | ||||||
|  | } | ||||||
|  | trap cleanup_testnet EXIT | ||||||
|  |  | ||||||
|  | # Create the testnet, start the network software, let it run, then query | ||||||
|  | # InfluxDB for summary statistics and upload them as a CI artifact. | ||||||
|  | # | ||||||
|  | # Globals read:    CLOUD_PROVIDER, TESTNET_TAG, NUMBER_OF_VALIDATOR_NODES, | ||||||
|  | #                  NUMBER_OF_CLIENT_NODES, VALIDATOR_NODE_MACHINE_TYPE, | ||||||
|  | #                  maybeMachineType, TESTNET_CLOUD_ZONES, ADDITIONAL_FLAGS, | ||||||
|  | #                  CHANNEL, CLIENT_OPTIONS, maybeClientOptions, RAMP_UP_TIME, | ||||||
|  | #                  TEST_DURATION, INFLUX_HOST | ||||||
|  | # Globals written: RESULTS_FILE | ||||||
|  | # Exits with status 1 when CLOUD_PROVIDER is neither "gce" nor "colo". | ||||||
|  | launchTestnet() { | ||||||
|  |   set -x | ||||||
|  |  | ||||||
|  |   echo --- create "$NUMBER_OF_VALIDATOR_NODES" nodes | ||||||
|  |  | ||||||
|  |   # Optional arguments are collected in arrays so that an unset or empty | ||||||
|  |   # setting contributes no words at all. Passing "" instead hands the callee an | ||||||
|  |   # empty positional argument, which option parsers commonly treat as the end | ||||||
|  |   # of the options. | ||||||
|  |   declare -a machine_type_args=() zone_args=() extra_args=() client_args=() | ||||||
|  |   declare zone | ||||||
|  |   if [[ -n $VALIDATOR_NODE_MACHINE_TYPE ]]; then | ||||||
|  |     machine_type_args=("$maybeMachineType" "$VALIDATOR_NODE_MACHINE_TYPE") | ||||||
|  |   fi | ||||||
|  |   for zone in "${TESTNET_CLOUD_ZONES[@]}"; do | ||||||
|  |     zone_args+=(-z "$zone") | ||||||
|  |   done | ||||||
|  |   # A non-empty ADDITIONAL_FLAGS is still passed as one single argument. | ||||||
|  |   if [[ -n $ADDITIONAL_FLAGS ]]; then | ||||||
|  |     extra_args=("$ADDITIONAL_FLAGS") | ||||||
|  |   fi | ||||||
|  |   if [[ -n $CLIENT_OPTIONS ]]; then | ||||||
|  |     client_args=("$maybeClientOptions" "$CLIENT_OPTIONS") | ||||||
|  |   fi | ||||||
|  |  | ||||||
|  |   case $CLOUD_PROVIDER in | ||||||
|  |     gce) | ||||||
|  |       net/gce.sh create \ | ||||||
|  |         -d pd-ssd \ | ||||||
|  |         -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" \ | ||||||
|  |         "${machine_type_args[@]}" \ | ||||||
|  |         -p "$TESTNET_TAG" "${zone_args[@]}" "${extra_args[@]}" | ||||||
|  |       ;; | ||||||
|  |     colo) | ||||||
|  |       net/colo.sh create \ | ||||||
|  |         -n "$NUMBER_OF_VALIDATOR_NODES" -c "$NUMBER_OF_CLIENT_NODES" -g \ | ||||||
|  |         -p "$TESTNET_TAG" "${extra_args[@]}" | ||||||
|  |       ;; | ||||||
|  |     *) | ||||||
|  |       # No testnet was created, so there is nothing to configure or start. | ||||||
|  |       echo "Error: Unsupported cloud provider: $CLOUD_PROVIDER" | ||||||
|  |       exit 1 | ||||||
|  |       ;; | ||||||
|  |     esac | ||||||
|  |  | ||||||
|  |   echo --- configure database | ||||||
|  |   net/init-metrics.sh -e | ||||||
|  |  | ||||||
|  |   echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test | ||||||
|  |   if [[ -n $CHANNEL ]]; then | ||||||
|  |     net/net.sh start -t "$CHANNEL" "${client_args[@]}" | ||||||
|  |   else | ||||||
|  |     # The glob is intentionally unquoted so it expands to the tarball name. | ||||||
|  |     net/net.sh start -T solana-release*.tar.bz2 "${client_args[@]}" | ||||||
|  |   fi | ||||||
|  |  | ||||||
|  |   echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize | ||||||
|  |   sleep "$RAMP_UP_TIME" | ||||||
|  |  | ||||||
|  |   echo --- wait "$TEST_DURATION" seconds to complete test | ||||||
|  |   sleep "$TEST_DURATION" | ||||||
|  |  | ||||||
|  |   # Every query covers only the last TEST_DURATION seconds, i.e. the window | ||||||
|  |   # after the ramp-up sleep above. | ||||||
|  |   echo --- collect statistics about run | ||||||
|  |   declare q_mean_tps=' | ||||||
|  |     SELECT round(mean("sum_count")) AS "mean_tps" FROM ( | ||||||
|  |       SELECT sum("count") AS "sum_count" | ||||||
|  |         FROM "'"$TESTNET_TAG"'"."autogen"."banking_stage-record_transactions" | ||||||
|  |         WHERE time > now() - '"$TEST_DURATION"'s GROUP BY time(1s) | ||||||
|  |     )' | ||||||
|  |  | ||||||
|  |   declare q_max_tps=' | ||||||
|  |     SELECT round(max("sum_count")) AS "max_tps" FROM ( | ||||||
|  |       SELECT sum("count") AS "sum_count" | ||||||
|  |         FROM "'"$TESTNET_TAG"'"."autogen"."banking_stage-record_transactions" | ||||||
|  |         WHERE time > now() - '"$TEST_DURATION"'s GROUP BY time(1s) | ||||||
|  |     )' | ||||||
|  |  | ||||||
|  |   declare q_mean_confirmation=' | ||||||
|  |     SELECT round(mean("duration_ms")) as "mean_confirmation" | ||||||
|  |       FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation" | ||||||
|  |       WHERE time > now() - '"$TEST_DURATION"'s' | ||||||
|  |  | ||||||
|  |   declare q_max_confirmation=' | ||||||
|  |     SELECT round(max("duration_ms")) as "max_confirmation" | ||||||
|  |       FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation" | ||||||
|  |       WHERE time > now() - '"$TEST_DURATION"'s' | ||||||
|  |  | ||||||
|  |   declare q_99th_confirmation=' | ||||||
|  |     SELECT round(percentile("duration_ms", 99)) as "99th_confirmation" | ||||||
|  |       FROM "'"$TESTNET_TAG"'"."autogen"."validator-confirmation" | ||||||
|  |       WHERE time > now() - '"$TEST_DURATION"'s' | ||||||
|  |  | ||||||
|  |   # All five queries go out in one request; the JSON response is piped through | ||||||
|  |   # the parser script and appended to the results file. | ||||||
|  |   RESULTS_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log | ||||||
|  |   curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \ | ||||||
|  |     --data-urlencode "db=${TESTNET_TAG}" \ | ||||||
|  |     --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" | | ||||||
|  |     python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULTS_FILE" | ||||||
|  |  | ||||||
|  |   upload-ci-artifact "$RESULTS_FILE" | ||||||
|  | } | ||||||
|  |  | ||||||
|  | # Run from two directories above this script so the relative net/, ci/ and | ||||||
|  | # system-test/ paths used throughout resolve. | ||||||
|  | cd "$(dirname "$0")/../.." | ||||||
|  |  | ||||||
|  | # Use SOLANA_METRICS_CONFIG as given; otherwise assemble it from TESTNET_TAG, | ||||||
|  | # INFLUX_HOST and SOLANA_METRICS_PARTIAL_CONFIG, which is then mandatory. | ||||||
|  | if [[ -z $SOLANA_METRICS_CONFIG ]]; then | ||||||
|  |   if [[ -z $SOLANA_METRICS_PARTIAL_CONFIG ]]; then | ||||||
|  |     echo SOLANA_METRICS_PARTIAL_CONFIG not defined | ||||||
|  |     exit 1 | ||||||
|  |   fi | ||||||
|  |   export SOLANA_METRICS_CONFIG="db=$TESTNET_TAG,host=$INFLUX_HOST,$SOLANA_METRICS_PARTIAL_CONFIG" | ||||||
|  | fi | ||||||
|  | # NOTE(review): this prints the whole metrics config to the build log; if the | ||||||
|  | # partial config carries credentials, confirm that exposing them is acceptable. | ||||||
|  | echo "SOLANA_METRICS_CONFIG: $SOLANA_METRICS_CONFIG" | ||||||
|  |  | ||||||
|  | # Without a release CHANNEL the network is started from a release tarball, | ||||||
|  | # which is fetched here from the build's artifacts. | ||||||
|  | if [[ -z $CHANNEL ]]; then | ||||||
|  |   echo --- downloading tar from build artifacts | ||||||
|  |   buildkite-agent artifact download "solana-release*.tar.bz2" . | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | # Presumably defines upload-ci-artifact, which the functions above call. | ||||||
|  | # shellcheck disable=SC1091 | ||||||
|  | source ci/upload-ci-artifact.sh | ||||||
|  |  | ||||||
|  | # Each flag expands to its option letter only when the matching value is set | ||||||
|  | # and non-empty, and to the empty string otherwise. | ||||||
|  | maybeClientOptions=${CLIENT_OPTIONS:+"-c"} | ||||||
|  | maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"} | ||||||
|  |  | ||||||
|  | # Split the comma-separated TESTNET_ZONES into an array, one zone per element. | ||||||
|  | IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}" | ||||||
|  |  | ||||||
|  | launchTestnet | ||||||
		Reference in New Issue
	
	Block a user