Add system test to measure recovery after partition (#20902)
* Add system test to measure recovery after partition
* shellcheck
* increase partition length until failure
* adjust parameters and output
* different stopping condition
This commit is contained in:
		@@ -110,6 +110,21 @@ function get_current_stake {
 | 
				
			|||||||
    '$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalCurrentStake\": [0-9]*" | cut -d: -f2'
 | 
					    '$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalCurrentStake\": [0-9]*" | cut -d: -f2'
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					function get_validator_confirmation_time {
 | 
				
			||||||
 | 
					  SINCE=$1
 | 
				
			||||||
 | 
					  declare q_mean_confirmation='
 | 
				
			||||||
 | 
					    SELECT ROUND(MEAN("duration_ms")) as "mean_confirmation_ms"
 | 
				
			||||||
 | 
					      FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
 | 
				
			||||||
 | 
					      WHERE time > now() - '"$SINCE"'s'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  mean_confirmation_ms=$( \
 | 
				
			||||||
 | 
					      curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
 | 
				
			||||||
 | 
					        --data-urlencode "db=${TESTNET_TAG}" \
 | 
				
			||||||
 | 
					        --data-urlencode "q=$q_mean_confirmation" |
 | 
				
			||||||
 | 
					      python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py --empty_error |
 | 
				
			||||||
 | 
					      cut -d' ' -f2)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
function collect_performance_statistics {
 | 
					function collect_performance_statistics {
 | 
				
			||||||
  execution_step "Collect performance statistics about run"
 | 
					  execution_step "Collect performance statistics about run"
 | 
				
			||||||
  declare q_mean_tps='
 | 
					  declare q_mean_tps='
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										22
									
								
								system-test/partition-testcases/gce-partition-recovery.yml
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										22
									
								
								system-test/partition-testcases/gce-partition-recovery.yml
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,22 @@
 | 
				
			|||||||
 | 
					steps:
 | 
				
			||||||
 | 
					  - command: "system-test/testnet-automation.sh"
 | 
				
			||||||
 | 
					    label: "Partition recovery on GCE"
 | 
				
			||||||
 | 
					    env:
 | 
				
			||||||
 | 
					      UPLOAD_RESULTS_TO_SLACK: "true"
 | 
				
			||||||
 | 
					      CLOUD_PROVIDER: "gce"
 | 
				
			||||||
 | 
					      ENABLE_GPU: "false"
 | 
				
			||||||
 | 
					      NUMBER_OF_VALIDATOR_NODES: 9
 | 
				
			||||||
 | 
					      VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16"
 | 
				
			||||||
 | 
					      NUMBER_OF_CLIENT_NODES: 1
 | 
				
			||||||
 | 
					      ADDITIONAL_FLAGS: "--dedicated"
 | 
				
			||||||
 | 
					      SKIP_PERF_RESULTS: "true"
 | 
				
			||||||
 | 
					      EXTRA_PRIMORDIAL_STAKES: 4
 | 
				
			||||||
 | 
					      TEST_TYPE: "script"
 | 
				
			||||||
 | 
					      WARMUP_SLOTS_BEFORE_TEST: 400
 | 
				
			||||||
 | 
					      PRE_PARTITION_DURATION: 120
 | 
				
			||||||
 | 
					      PARTITION_DURATION: 360
 | 
				
			||||||
 | 
					      PARTITION_INCREMENT: 60
 | 
				
			||||||
 | 
					      NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions"
 | 
				
			||||||
 | 
					      CUSTOM_SCRIPT: "system-test/partition-testcases/measure-partition-recovery.sh"
 | 
				
			||||||
 | 
					    agents:
 | 
				
			||||||
 | 
					      - "queue=gce-deploy"
 | 
				
			||||||
							
								
								
									
										81
									
								
								system-test/partition-testcases/measure-partition-recovery.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										81
									
								
								system-test/partition-testcases/measure-partition-recovery.sh
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,81 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env bash
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					set -ex
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# shellcheck disable=SC1090
 | 
				
			||||||
 | 
					# shellcheck disable=SC1091
 | 
				
			||||||
 | 
					source "$(dirname "$0")"/../automation_utils.sh
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					RESULT_FILE="$1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[[ -n $TESTNET_TAG ]] || TESTNET_TAG=${CLOUD_PROVIDER}-testnet-automation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ -z $NETEM_CONFIG_FILE  ]]; then
 | 
				
			||||||
 | 
					  echo "Error: For this test NETEM_CONFIG_FILE must be specified"
 | 
				
			||||||
 | 
					  exit 1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ -z $PRE_PARTITION_DURATION ]]; then
 | 
				
			||||||
 | 
					  PRE_PARTITION_DURATION=60
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ -z $PARTITION_DURATION ]]; then
 | 
				
			||||||
 | 
					  PARTITION_DURATION=300
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ -z $PARTITION_INCREMENT ]]; then
 | 
				
			||||||
 | 
					  PARTITION_INCREMENT=60
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 ))
 | 
				
			||||||
 | 
					if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then
 | 
				
			||||||
 | 
					  num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES ))
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					execution_step "Measuring validator confirmation time for $PRE_PARTITION_DURATION seconds"
 | 
				
			||||||
 | 
					sleep "$PRE_PARTITION_DURATION"
 | 
				
			||||||
 | 
					get_validator_confirmation_time "$PRE_PARTITION_DURATION"
 | 
				
			||||||
 | 
					# shellcheck disable=SC2154
 | 
				
			||||||
 | 
					execution_step "Pre partition validator confirmation time is $mean_confirmation_ms ms"
 | 
				
			||||||
 | 
					echo "Pre partition validator confirmation time: $mean_confirmation_ms ms" >> "$RESULT_FILE"
 | 
				
			||||||
 | 
					target=$mean_confirmation_ms
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					while true; do
 | 
				
			||||||
 | 
					  execution_step "Applying partition config $NETEM_CONFIG_FILE for $PARTITION_DURATION seconds"
 | 
				
			||||||
 | 
					  echo "Partitioning for $PARTITION_DURATION seconds" >> "$RESULT_FILE"
 | 
				
			||||||
 | 
					  "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n $num_online_nodes
 | 
				
			||||||
 | 
					  sleep "$PARTITION_DURATION"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  execution_step "Resolving partition"
 | 
				
			||||||
 | 
					  "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n $num_online_nodes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  get_validator_confirmation_time 10
 | 
				
			||||||
 | 
					  SECONDS=0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  # This happens when we haven't confirmed anything recently so the query returns an empty string
 | 
				
			||||||
 | 
					  while [[ -z $mean_confirmation_ms ]]; do
 | 
				
			||||||
 | 
					    sleep 5
 | 
				
			||||||
 | 
					    get_validator_confirmation_time 10
 | 
				
			||||||
 | 
					    if [[ $SECONDS -gt $PARTITION_DURATION ]]; then
 | 
				
			||||||
 | 
					      echo "  No confirmations seen after $SECONDS seconds" >> "$RESULT_FILE"
 | 
				
			||||||
 | 
					      exit 0
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					  done
 | 
				
			||||||
 | 
					  echo "  Validator confirmation is $mean_confirmation_ms ms $SECONDS seconds after resolving the partition" >> "$RESULT_FILE"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  last=""
 | 
				
			||||||
 | 
					  while [[ -z $mean_confirmation_ms || $mean_confirmation_ms -gt $target ]]; do
 | 
				
			||||||
 | 
					    sleep 5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if [[ -n $mean_confirmation_ms && -n $last && $mean_confirmation_ms -gt $(echo "$last * 1.2" | bc) || $SECONDS -gt $PARTITION_DURATION ]]; then
 | 
				
			||||||
 | 
					      echo "  Unable to make progress after $SECONDS seconds. Last confirmation time was $mean_confirmation_ms ms" >> "$RESULT_FILE"
 | 
				
			||||||
 | 
					      exit 0
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					    last=$mean_confirmation_ms
 | 
				
			||||||
 | 
					    get_validator_confirmation_time 10
 | 
				
			||||||
 | 
					  done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  echo "  Recovered in $SECONDS seconds: validator confirmation to fall to $mean_confirmation_ms ms" >> "$RESULT_FILE"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  PARTITION_DURATION=$(( PARTITION_DURATION + PARTITION_INCREMENT ))
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
@@ -1,5 +1,9 @@
 | 
				
			|||||||
#!/usr/bin/env python3
 | 
					#!/usr/bin/env python3
 | 
				
			||||||
import sys, json
 | 
					import sys, json, argparse
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					parser = argparse.ArgumentParser()
 | 
				
			||||||
 | 
					parser.add_argument("--empty_error", action="store_true", help="If present, do not print error message")
 | 
				
			||||||
 | 
					args = parser.parse_args()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
data=json.load(sys.stdin)
 | 
					data=json.load(sys.stdin)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -7,7 +11,7 @@ if 'results' in data:
 | 
				
			|||||||
   for result in data['results']:
 | 
					   for result in data['results']:
 | 
				
			||||||
      if 'series' in result:
 | 
					      if 'series' in result:
 | 
				
			||||||
         print(result['series'][0]['columns'][1] + ': ' + str(result['series'][0]['values'][0][1]))
 | 
					         print(result['series'][0]['columns'][1] + ': ' + str(result['series'][0]['values'][0][1]))
 | 
				
			||||||
      else:
 | 
					      elif not args.empty_error:
 | 
				
			||||||
         print("An expected result from CURL request is missing")
 | 
					         print("An expected result from CURL request is missing")
 | 
				
			||||||
else:
 | 
					elif not args.empty_error:
 | 
				
			||||||
   print("No results returned from CURL request")
 | 
					   print("No results returned from CURL request")
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user