Push perf test results to slack app (#6371)

* Add script to publish testnet results to slack * Obscure webhook URL * fixup * Replace read with cat redirection * Turn back on net restart * Pick nits * Make symlink before trying to delete its contents * Display test config in slack and pick Trents nit not to maybe rm -rf /* * Clean up results print * Minor nits * Turn the test settings back up to 11 * typo * Shellcheck * Just a few more fields * fix payload formatting * Del clear-config.sh * Mount secondary * Add commit SHA link and Grafana time range URL * Add fancy buttons instead of text URLs * Tighten up test config display * Fixup display nits * chellsheck * Rebase and fix typo
2019-10-21 20:00:17 -04:00
parent d1b18a5060
commit 00809a67c0
8 changed files with 167 additions and 27 deletions
--- a/net/common.sh
+++ b/net/common.sh
@ -113,11 +113,14 @@ clear_config_dir() {
 SECONDARY_DISK_MOUNT_POINT=/mnt/extra-disk
 setup_secondary_mount() {
  # If there is a secondary disk, symlink the config/ dir there
-  if [[ -d $SECONDARY_DISK_MOUNT_POINT ]] && \
+  (
-    [[ -w $SECONDARY_DISK_MOUNT_POINT ]]; then
+    set -x
-    mkdir -p $SECONDARY_DISK_MOUNT_POINT/config
+    if [[ -d $SECONDARY_DISK_MOUNT_POINT ]] && \
-    rm -rf "$SOLANA_CONFIG_DIR"
+      [[ -w $SECONDARY_DISK_MOUNT_POINT ]]; then
-    ln -sfT $SECONDARY_DISK_MOUNT_POINT/config "$SOLANA_CONFIG_DIR"
+      mkdir -p $SECONDARY_DISK_MOUNT_POINT/config
-  fi
+      rm -rf "$SOLANA_CONFIG_DIR"
      ln -sfT $SECONDARY_DISK_MOUNT_POINT/config "$SOLANA_CONFIG_DIR"
    fi
  )
 }
--- a/net/remote/remote-node.sh
+++ b/net/remote/remote-node.sh
@ -152,6 +152,7 @@ EOF
    set -x
    if [[ $skipSetup != true ]]; then
      clear_config_dir "$SOLANA_CONFIG_DIR"
      setup_secondary_mount
      if [[ -n $internalNodesLamports ]]; then
        echo "---" >> config/fullnode-balances.yml
@ -248,6 +249,7 @@ EOF
    fi
    if [[ $skipSetup != true ]]; then
      clear_config_dir "$SOLANA_CONFIG_DIR"
      setup_secondary_mount
      [[ -z $internalNodesLamports ]] || net/scripts/rsync-retry.sh -vPrc \
      "$entrypointIp":~/solana/config/fullnode-"$nodeIndex"-identity.json config/fullnode-identity.json
    fi
--- a/system-test/testnet-performance/colo-gpu-perf.yml
+++ b/system-test/testnet-performance/colo-gpu-perf.yml
@ -2,13 +2,14 @@ steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "COLO performance testnet GPU enabled"
    env:
      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "colo"
      TESTNET_TAG: "colo-edge-perf-gpu-enabled"
-      RAMP_UP_TIME: 60
+      RAMP_UP_TIME: 0
-      TEST_DURATION: 300
+      TEST_DURATION: 600
      NUMBER_OF_VALIDATOR_NODES: 4
      NUMBER_OF_CLIENT_NODES: 2
-      CLIENT_OPTIONS: "bench-tps=2=--tx_count 80000 --thread-batch-sleep-ms 1000"
+      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
      ADDITIONAL_FLAGS: ""
    agents:
      - "queue=colo-deploy"
--- a/system-test/testnet-performance/gce-cpu-only-perf.yml
+++ b/system-test/testnet-performance/gce-cpu-only-perf.yml
@ -2,6 +2,7 @@ steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "GCE performance testnets CPU ONLY"
    env:
      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "gce"
      TESTNET_TAG: "gce-edge-perf-cpu-only"
      RAMP_UP_TIME: 60
--- a/system-test/testnet-performance/gce-gpu-perf.yml
+++ b/system-test/testnet-performance/gce-gpu-perf.yml
@ -2,14 +2,15 @@ steps:
  - command: "system-test/testnet-performance/testnet-automation.sh"
    label: "GCE performance testnets GPU ENABLED"
    env:
      UPLOAD_RESULTS_TO_SLACK: "true"
      CLOUD_PROVIDER: "gce"
      TESTNET_TAG: "gce-edge-perf-gpu-enabled"
-      RAMP_UP_TIME: 60
+      RAMP_UP_TIME: 0
-      TEST_DURATION: 300
+      TEST_DURATION: 600
-      NUMBER_OF_VALIDATOR_NODES: 10
+      NUMBER_OF_VALIDATOR_NODES: 50
      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16 --accelerator count=2,type=nvidia-tesla-v100"
-      NUMBER_OF_CLIENT_NODES: 1
+      NUMBER_OF_CLIENT_NODES: 2
-      CLIENT_OPTIONS: "bench-tps=1=--tx_count 80000 --thread-batch-sleep-ms 1000"
+      CLIENT_OPTIONS: "bench-tps=2=--tx_count 15000 --thread-batch-sleep-ms 250"
      TESTNET_ZONES: "us-west1-a,us-west1-b,us-central1-a,europe-west4-a"
      ADDITIONAL_FLAGS: ""
    agents:
--- a/system-test/testnet-performance/testnet-automation-json-parser.py
+++ b/system-test/testnet-performance/testnet-automation-json-parser.py
@ -2,6 +2,9 @@
 import sys, json
 data=json.load(sys.stdin)
-print[\
+
-   ([result['series'][0]['columns'][1].encode(), result['series'][0]['values'][0][1]]) \
+if 'results' in data:
-   for result in data['results']]
+   for result in data['results']:
      print result['series'][0]['columns'][1].encode() + ': ' + str(result['series'][0]['values'][0][1])
 else:
   print "No results returned from CURL request"
--- a/system-test/testnet-performance/testnet-automation.sh
+++ b/system-test/testnet-performance/testnet-automation.sh
@ -7,10 +7,9 @@ set -e
 # TODO: Remove all default values, force explicitness in the testcase definition
 [[ -n $TEST_DURATION ]] || TEST_DURATION=300
-[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=60
+[[ -n $RAMP_UP_TIME ]] || RAMP_UP_TIME=0
 [[ -n $NUMBER_OF_VALIDATOR_NODES ]] || NUMBER_OF_VALIDATOR_NODES=2
 [[ -n $NUMBER_OF_CLIENT_NODES ]] || NUMBER_OF_CLIENT_NODES=1
 [[ -n $TESTNET_ZONES ]] || TESTNET_ZONES="us-west1-a"
 function collect_logs {
  echo --- collect logs from remote nodes
@ -26,6 +25,11 @@ function collect_logs {
 }
 function cleanup_testnet {
  FINISH_UNIX_MSECS="$(($(date +%s%N)/1000000))"
  if [[ -n $UPLOAD_RESULTS_TO_SLACK ]] ; then
    upload_results_to_slack
  fi
  (
    set +e
    collect_logs
@ -101,9 +105,9 @@ launchTestnet() {
  echo --- start "$NUMBER_OF_VALIDATOR_NODES" node test
  if [[ -n $CHANNEL ]]; then
-    net/net.sh start -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS"
+    net/net.sh restart -t "$CHANNEL" "$maybeClientOptions" "$CLIENT_OPTIONS"
  else
-    net/net.sh start -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS"
+    net/net.sh restart -T solana-release*.tar.bz2 "$maybeClientOptions" "$CLIENT_OPTIONS"
  fi
  echo --- wait "$RAMP_UP_TIME" seconds for network throughput to stabilize
@ -128,27 +132,27 @@ launchTestnet() {
    )'
  declare q_mean_confirmation='
-    SELECT round(mean("duration_ms")) as "mean_confirmation"
+    SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION"'s'
  declare q_max_confirmation='
-    SELECT round(max("duration_ms")) as "max_confirmation"
+    SELECT round(max("duration_ms")) as "max_confirmation_ms"
      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION"'s'
  declare q_99th_confirmation='
-    SELECT round(percentile("duration_ms", 99)) as "99th_confirmation"
+    SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
      WHERE time > now() - '"$TEST_DURATION"'s'
  RESULTS_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
  curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
    --data-urlencode "db=${TESTNET_TAG}" \
    --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation" |
-    python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULTS_FILE"
+    python system-test/testnet-performance/testnet-automation-json-parser.py >>"$RESULT_FILE"
-  upload-ci-artifact "$RESULTS_FILE"
+  RESULT_DETAILS=$(<"$RESULT_FILE")
  upload-ci-artifact "$RESULT_FILE"
 }
 cd "$(dirname "$0")/../.."
@ -169,10 +173,33 @@ fi
 # shellcheck disable=SC1091
 source ci/upload-ci-artifact.sh
 source system-test/testnet-performance/upload_results_to_slack.sh
 # Only pass the corresponding flag when the option value is non-empty.
 maybeClientOptions=${CLIENT_OPTIONS:+"-c"}
 maybeMachineType=${VALIDATOR_NODE_MACHINE_TYPE:+"-G"}
 # Split the comma-separated zone list into an array.
 IFS=, read -r -a TESTNET_CLOUD_ZONES <<<"${TESTNET_ZONES}"
 # Start from a clean summary file. The path is quoted and preceded by `--` so
 # a tag containing whitespace, glob characters or a leading dash cannot make
 # rm act on anything other than this one file.
 RESULT_FILE="$TESTNET_TAG"_SUMMARY_STATS_"$NUMBER_OF_VALIDATOR_NODES".log
 rm -f -- "$RESULT_FILE"
 # Initial value of RESULT_DETAILS; it is assigned again from the summary file
 # once the results have been written.
 RESULT_DETAILS="Test failed to finish"
 # Names of the settings to include in the displayed test configuration.
 TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \
                        NUMBER_OF_VALIDATOR_NODES \
                        VALIDATOR_NODE_MACHINE_TYPE \
                        NUMBER_OF_CLIENT_NODES \
                        CLIENT_OPTIONS \
                        TESTNET_ZONES \
                        TEST_DURATION \
                        ADDITIONAL_FLAGS)
 # Build a "NAME = value | " string from the settings above, skipping any that
 # are unset or empty (${!i} is indirect expansion of the variable named by i).
 TEST_CONFIGURATION=
 for i in "${TEST_PARAMS_TO_DISPLAY[@]}" ; do
  if [[ -n ${!i} ]] ; then
    TEST_CONFIGURATION+="${i} = ${!i} | "
  fi
 done
 # Millisecond timestamp of the test start (date +%s%N is nanoseconds).
 START_UNIX_MSECS="$(($(date +%s%N)/1000000))"
 launchTestnet
--- a/system-test/testnet-performance/upload_results_to_slack.sh
+++ b/system-test/testnet-performance/upload_results_to_slack.sh
@ -0,0 +1,102 @@
 # Escape a string so it can be embedded inside a JSON string literal.
 # Arguments: $1 - raw text
 # Outputs:   the escaped text on stdout (no trailing newline)
 # Backslash, double quote, newline, carriage return and tab are replaced by
 # their JSON escapes; any other control character is dropped because a raw
 # control character is not allowed inside a JSON string.
 _slack_json_escape() {
   local s=$1
   s=${s//\\/\\\\}
   s=${s//\"/\\\"}
   s=${s//$'\n'/\\n}
   s=${s//$'\r'/\\r}
   s=${s//$'\t'/\\t}
   s=${s//[[:cntrl:]]/}
   printf '%s' "$s"
 }
 # Post a summary of the test run to the Slack incoming webhook.
 # Globals read:    SLACK_WEBHOOK_URL (required), BUILDKITE_MESSAGE,
 #                  BUILDKITE_COMMIT, BUILDKITE_BUILD_URL, CHANNEL, TESTNET_TAG,
 #                  START_UNIX_MSECS, FINISH_UNIX_MSECS, RESULT_DETAILS,
 #                  TEST_CONFIGURATION
 # Globals written: BUILDKITE_MESSAGE, BUILDKITE_BUILD_URL, RESULT_DETAILS and
 #                  TEST_CONFIGURATION receive fallback values when empty;
 #                  COMMIT_BUTTON_TEXT, COMMIT_URL, BUILD_BUTTON_TEXT,
 #                  GRAFANA_URL and payLoad are (re)assigned. They are kept
 #                  global, as before, so existing readers are unaffected.
 # Exits the shell with status 1 when SLACK_WEBHOOK_URL is empty.
 upload_results_to_slack() {
   echo --- Uploading results to Slack Performance Results App
   if [[ -z $SLACK_WEBHOOK_URL ]] ; then
     echo "SLACK_WEBHOOK_URL undefined"
     exit 1
   fi
   [[ -n $BUILDKITE_MESSAGE ]] || BUILDKITE_MESSAGE="Message not defined"
   if [[ -n $BUILDKITE_COMMIT ]] ; then
     # Button label is the first 8 characters of the commit id.
     COMMIT_BUTTON_TEXT=${BUILDKITE_COMMIT:0:8}
     COMMIT_URL="https://github.com/solana-labs/solana/commit/${BUILDKITE_COMMIT}"
   else
     COMMIT_BUTTON_TEXT="Commit not defined"
     COMMIT_URL="https://github.com/solana-labs/solana/commits/master"
   fi
   if [[ -n $BUILDKITE_BUILD_URL ]] ; then
     BUILD_BUTTON_TEXT="Build Kite Job"
   else
     BUILD_BUTTON_TEXT="Build URL not defined"
     BUILDKITE_BUILD_URL="https://buildkite.com/solana-labs/"
   fi
   GRAFANA_URL="https://metrics.solana.com:3000/d/testnet-${CHANNEL:-edge}/testnet-monitor-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${START_UNIX_MSECS:-0}&to=${FINISH_UNIX_MSECS:-0}"
   [[ -n $RESULT_DETAILS ]] || RESULT_DETAILS="Undefined"
   [[ -n $TEST_CONFIGURATION ]] || TEST_CONFIGURATION="Undefined"
   # Escape every interpolated value: commit messages may contain quotes or
   # backslashes and RESULT_DETAILS is multi-line, either of which would
   # otherwise yield an invalid JSON document.
   local header_text config_text details_text
   header_text="*New Build: $(_slack_json_escape "$BUILDKITE_MESSAGE")*"
   # The \n inside single quotes is a literal backslash-n, i.e. a JSON newline
   # escape that precedes the code fence.
   config_text='Test Configuration: \n```'"$(_slack_json_escape "$TEST_CONFIGURATION")"'```'
   details_text='Result Details: \n```'"$(_slack_json_escape "$RESULT_DETAILS")"'```'
   # The values are passed as %s arguments, so printf never interprets them.
   payLoad="$(printf '{
   "blocks": [
     {
       "type": "section",
       "text": {
         "type": "mrkdwn",
         "text": "%s"
       }
     },
     {
       "type": "actions",
       "elements": [
         {
           "type": "button",
           "text": {
             "type": "plain_text",
             "text": "%s",
             "emoji": true
           },
           "url": "%s"
         },
         {
           "type": "button",
           "text": {
             "type": "plain_text",
             "text": "%s",
             "emoji": true
           },
           "url": "%s"
         },
         {
           "type": "button",
           "text": {
             "type": "plain_text",
             "text": "Grafana",
             "emoji": true
           },
           "url": "%s"
         }
       ]
     },
     {
       "type": "divider"
     },
     {
       "type": "section",
       "text": {
         "type": "mrkdwn",
         "text": "%s"
       }
     },
     {
       "type": "divider"
     },
     {
       "type": "section",
       "text": {
         "type": "mrkdwn",
         "text": "%s"
       }
     }
   ]
 }' \
     "$header_text" \
     "$(_slack_json_escape "$COMMIT_BUTTON_TEXT")" \
     "$(_slack_json_escape "$COMMIT_URL")" \
     "$(_slack_json_escape "$BUILD_BUTTON_TEXT")" \
     "$(_slack_json_escape "$BUILDKITE_BUILD_URL")" \
     "$(_slack_json_escape "$GRAFANA_URL")" \
     "$config_text" \
     "$details_text")"
   curl -X POST \
     -H 'Content-type: application/json' \
     --data "$payLoad" \
     "$SLACK_WEBHOOK_URL"
 }