- Encapsulate QoS Service metrics reporting within QosService, so client (#21191)

code (e.g. banking_stage) doesn't need to worry about it. - Remove dead cost_* stats from banking_stage and clean up the call path.
2021-11-18 15:35:30 -06:00
parent a272e19f8d
commit 0ca255220e
3 changed files with 197 additions and 103 deletions
--- a/core/src/qos_service.rs
+++ b/core/src/qos_service.rs
@@ -9,58 +9,69 @@ use {
        cost_model::{CostModel, TransactionCost},
        cost_tracker::CostTrackerError,
    },
-    solana_sdk::transaction::{self, SanitizedTransaction, TransactionError},
-    std::sync::{Arc, RwLock},
+    solana_sdk::{
+        timing::AtomicInterval,
+        transaction::{self, SanitizedTransaction, TransactionError},
+    },
+    std::{
+        sync::{
+            atomic::{AtomicBool, AtomicU64, Ordering},
+            Arc, RwLock,
+        },
+        thread::{self, Builder, JoinHandle},
+        time::Duration,
+    },
 };

-#[derive(Default)]
-pub struct QosServiceStats {
-    compute_cost_time: u64,
-    cost_tracking_time: u64,
-    selected_txs_count: u64,
-    retried_txs_per_block_limit_count: u64,
-    retried_txs_per_account_limit_count: u64,
-}
-
-impl QosServiceStats {
-    pub fn report(&mut self) {
-        datapoint_info!(
-            "qos-service-stats",
-            ("compute_cost_time", self.compute_cost_time, i64),
-            ("cost_tracking_time", self.cost_tracking_time, i64),
-            ("selected_txs_count", self.selected_txs_count, i64),
-            (
-                "retried_txs_per_block_limit_count",
-                self.retried_txs_per_block_limit_count,
-                i64
-            ),
-            (
-                "retried_txs_per_account_limit_count",
-                self.retried_txs_per_account_limit_count,
-                i64
-            ),
-        );
-    }
-}
-
 pub struct QosService {
    cost_model: Arc<RwLock<CostModel>>,
+    metrics: Arc<QosServiceMetrics>,
+    reporting_thread: Option<JoinHandle<()>>,
+    running_flag: Arc<AtomicBool>,
+}
+
+impl Drop for QosService {
+    fn drop(&mut self) {
+        self.running_flag.store(false, Ordering::Relaxed);
+        self.reporting_thread
+            .take()
+            .unwrap()
+            .join()
+            .expect("qos service metrics reporting thread failed to join");
+    }
 }

 impl QosService {
    pub fn new(cost_model: Arc<RwLock<CostModel>>) -> Self {
-        Self { cost_model }
+        let running_flag = Arc::new(AtomicBool::new(true));
+        let metrics = Arc::new(QosServiceMetrics::default());
+
+        let running_flag_clone = running_flag.clone();
+        let metrics_clone = metrics.clone();
+        let reporting_thread = Some(
+            Builder::new()
+                .name("solana-qos-service-metrics-repoting".to_string())
+                .spawn(move || {
+                    Self::reporting_loop(running_flag_clone, metrics_clone);
+                })
+                .unwrap(),
+        );
+        Self {
+            cost_model,
+            metrics,
+            reporting_thread,
+            running_flag,
+        }
    }

    pub fn compute_transaction_costs<'a>(
        &self,
        transactions: impl Iterator<Item = &'a SanitizedTransaction>,
        demote_program_write_locks: bool,
-        stats: &mut QosServiceStats,
    ) -> Vec<TransactionCost> {
        let mut compute_cost_time = Measure::start("compute_cost_time");
        let cost_model = self.cost_model.read().unwrap();
-        let txs_costs = transactions
+        let txs_costs: Vec<_> = transactions
            .map(|tx| {
                let cost = cost_model.calculate_cost(tx, demote_program_write_locks);
                debug!(
@@ -73,7 +84,12 @@ impl QosService {
            })
            .collect();
        compute_cost_time.stop();
-        stats.compute_cost_time += compute_cost_time.as_us();
+        self.metrics
+            .compute_cost_time
+            .fetch_add(compute_cost_time.as_us(), Ordering::Relaxed);
+        self.metrics
+            .compute_cost_count
+            .fetch_add(txs_costs.len() as u64, Ordering::Relaxed);
        txs_costs
    }

@@ -84,7 +100,6 @@ impl QosService {
        transactions: impl Iterator<Item = &'a SanitizedTransaction>,
        transactions_costs: impl Iterator<Item = &'a TransactionCost>,
        bank: &Arc<Bank>,
-        stats: &mut QosServiceStats,
    ) -> Vec<transaction::Result<()>> {
        let mut cost_tracking_time = Measure::start("cost_tracking_time");
        let mut cost_tracker = bank.write_cost_tracker().unwrap();
@@ -93,18 +108,18 @@ impl QosService {
            .map(|(tx, cost)| match cost_tracker.try_add(tx, cost) {
                Ok(current_block_cost) => {
                    debug!("slot {:?}, transaction {:?}, cost {:?}, fit into current block, current block cost {}", bank.slot(), tx, cost, current_block_cost);
-                    stats.selected_txs_count += 1;
+                    self.metrics.selected_txs_count.fetch_add(1, Ordering::Relaxed);
                    Ok(())
                },
                Err(e) => {
                    debug!("slot {:?}, transaction {:?}, cost {:?}, not fit into current block, '{:?}'", bank.slot(), tx, cost, e);
                    match e {
                        CostTrackerError::WouldExceedBlockMaxLimit => {
-                            stats.retried_txs_per_block_limit_count += 1;
+                            self.metrics.retried_txs_per_block_limit_count.fetch_add(1, Ordering::Relaxed);
                            Err(TransactionError::WouldExceedMaxBlockCostLimit)
                        }
                        CostTrackerError::WouldExceedAccountMaxLimit => {
-                            stats.retried_txs_per_account_limit_count += 1;
+                            self.metrics.retried_txs_per_account_limit_count.fetch_add(1, Ordering::Relaxed);
                            Err(TransactionError::WouldExceedMaxAccountCostLimit)
                        }
                    }
@@ -112,9 +127,72 @@ impl QosService {
            })
            .collect();
        cost_tracking_time.stop();
-        stats.cost_tracking_time += cost_tracking_time.as_us();
+        self.metrics
+            .cost_tracking_time
+            .fetch_add(cost_tracking_time.as_us(), Ordering::Relaxed);
        select_results
    }
+
+    fn reporting_loop(running_flag: Arc<AtomicBool>, metrics: Arc<QosServiceMetrics>) {
+        while running_flag.load(Ordering::Relaxed) {
+            // hardcode to report every 1000ms
+            metrics.report(1000u64);
+            thread::sleep(Duration::from_millis(100));
+        }
+    }
+}
+
+#[derive(Default)]
+struct QosServiceMetrics {
+    last_report: AtomicInterval,
+    compute_cost_time: AtomicU64,
+    compute_cost_count: AtomicU64,
+    cost_tracking_time: AtomicU64,
+    selected_txs_count: AtomicU64,
+    retried_txs_per_block_limit_count: AtomicU64,
+    retried_txs_per_account_limit_count: AtomicU64,
+}
+
+impl QosServiceMetrics {
+    pub fn report(&self, report_interval_ms: u64) {
+        if self.last_report.should_update(report_interval_ms) {
+            datapoint_info!(
+                "qos-service-stats",
+                (
+                    "compute_cost_time",
+                    self.compute_cost_time.swap(0, Ordering::Relaxed) as i64,
+                    i64
+                ),
+                (
+                    "compute_cost_count",
+                    self.compute_cost_count.swap(0, Ordering::Relaxed) as i64,
+                    i64
+                ),
+                (
+                    "cost_tracking_time",
+                    self.cost_tracking_time.swap(0, Ordering::Relaxed) as i64,
+                    i64
+                ),
+                (
+                    "selected_txs_count",
+                    self.selected_txs_count.swap(0, Ordering::Relaxed) as i64,
+                    i64
+                ),
+                (
+                    "retried_txs_per_block_limit_count",
+                    self.retried_txs_per_block_limit_count
+                        .swap(0, Ordering::Relaxed) as i64,
+                    i64
+                ),
+                (
+                    "retried_txs_per_account_limit_count",
+                    self.retried_txs_per_account_limit_count
+                        .swap(0, Ordering::Relaxed) as i64,
+                    i64
+                ),
+            );
+        }
+    }
 }

 #[cfg(test)]
@@ -135,7 +213,7 @@ mod tests {
    };

    #[test]
-    fn test_compute_transactions_costs() {
+    fn test_compute_transaction_costs() {
        solana_logger::setup();

        // make a vec of txs
@@ -158,11 +236,7 @@ mod tests {

        let cost_model = Arc::new(RwLock::new(CostModel::default()));
        let qos_service = QosService::new(cost_model.clone());
-        let txs_costs = qos_service.compute_transaction_costs(
-            txs.iter(),
-            false,
-            &mut QosServiceStats::default(),
-        );
+        let txs_costs = qos_service.compute_transaction_costs(txs.iter(), false);

        // verify the size of txs_costs and its contents
        assert_eq!(txs_costs.len(), txs.len());
@@ -214,23 +288,14 @@ mod tests {
        let txs = vec![transfer_tx.clone(), vote_tx.clone(), transfer_tx, vote_tx];

        let qos_service = QosService::new(cost_model);
-        let txs_costs = qos_service.compute_transaction_costs(
-            txs.iter(),
-            false,
-            &mut QosServiceStats::default(),
-        );
+        let txs_costs = qos_service.compute_transaction_costs(txs.iter(), false);

        // set cost tracker limit to fit 1 transfer tx, vote tx bypasses limit check
        let cost_limit = transfer_tx_cost;
        bank.write_cost_tracker()
            .unwrap()
            .set_limits(cost_limit, cost_limit);
-        let results = qos_service.select_transactions_per_cost(
-            txs.iter(),
-            txs_costs.iter(),
-            &bank,
-            &mut QosServiceStats::default(),
-        );
+        let results = qos_service.select_transactions_per_cost(txs.iter(), txs_costs.iter(), &bank);

        // verify that first transfer tx and all votes are allowed
        assert_eq!(results.len(), txs.len());
@@ -239,4 +304,46 @@ mod tests {
        assert!(results[2].is_err());
        assert!(results[3].is_ok());
    }
+
+    #[test]
+    fn test_async_report_metrics() {
+        solana_logger::setup();
+
+        // make a vec of txs
+        let txs_count = 2048usize;
+        let keypair = Keypair::new();
+        let transfer_tx = SanitizedTransaction::from_transaction_for_tests(
+            system_transaction::transfer(&keypair, &keypair.pubkey(), 1, Hash::default()),
+        );
+        let mut txs_1 = Vec::with_capacity(txs_count);
+        let mut txs_2 = Vec::with_capacity(txs_count);
+        for _i in 0..txs_count {
+            txs_1.push(transfer_tx.clone());
+            txs_2.push(transfer_tx.clone());
+        }
+
+        let cost_model = Arc::new(RwLock::new(CostModel::default()));
+        let qos_service = Arc::new(QosService::new(cost_model));
+        let qos_service_1 = qos_service.clone();
+        let qos_service_2 = qos_service.clone();
+
+        let th_1 = thread::spawn(move || {
+            qos_service_1.compute_transaction_costs(txs_1.iter(), false);
+        });
+
+        let th_2 = thread::spawn(move || {
+            qos_service_2.compute_transaction_costs(txs_2.iter(), false);
+        });
+
+        th_1.join().expect("qos service 1 faield to join");
+        th_2.join().expect("qos service 2 faield to join");
+
+        assert_eq!(
+            txs_count as u64 * 2,
+            qos_service
+                .metrics
+                .compute_cost_count
+                .load(Ordering::Relaxed)
+        );
+    }
 }