multi-pass bin scanning (#15377)

* multi-pass bin scanning
* pr feedback
* format
* fix typo
* adjust metrics for code changes
* merge errors

Committed by: GitHub
Parent: 0988c2f1d6
Commit: 4beb39f7a1
@@ -20,7 +20,7 @@

 use crate::{
     accounts_cache::{AccountsCache, CachedAccount, SlotCache},
-    accounts_hash::{AccountsHash, CalculateHashIntermediate, HashStats},
+    accounts_hash::{AccountsHash, CalculateHashIntermediate, HashStats, PreviousPass},
     accounts_index::{
         AccountIndex, AccountsIndex, AccountsIndexRootsStats, Ancestors, IndexKey, IsCached,
         SlotList, SlotSlice, ZeroLamport,
@@ -54,7 +54,7 @@ use std::{
     collections::{hash_map::Entry, BTreeMap, BTreeSet, HashMap, HashSet},
     convert::TryFrom,
     io::{Error as IoError, Result as IoResult},
-    ops::RangeBounds,
+    ops::{Range, RangeBounds},
     path::{Path, PathBuf},
     sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering},
     sync::{Arc, Mutex, MutexGuard, RwLock},
@@ -3705,9 +3705,11 @@ impl AccountsDb {
         storage: &[SnapshotStorage],
         mut stats: &mut crate::accounts_hash::HashStats,
         bins: usize,
+        bin_range: &Range<usize>,
     ) -> Vec<Vec<Vec<CalculateHashIntermediate>>> {
         let max_plus_1 = std::u8::MAX as usize + 1;
         assert!(bins <= max_plus_1 && bins > 0);
+        assert!(bin_range.start < bins && bin_range.end <= bins && bin_range.start < bin_range.end);
         let mut time = Measure::start("scan all accounts");
         stats.num_snapshot_storage = storage.len();
         let result: Vec<Vec<Vec<CalculateHashIntermediate>>> = Self::scan_account_storage_no_bank(
@@ -3716,6 +3718,12 @@
             |loaded_account: LoadedAccount,
              accum: &mut Vec<Vec<CalculateHashIntermediate>>,
              slot: Slot| {
+                let pubkey = *loaded_account.pubkey();
+                let pubkey_to_bin_index = pubkey.as_ref()[0] as usize * bins / max_plus_1;
+                if !bin_range.contains(&pubkey_to_bin_index) {
+                    return;
+                }
+
                 let version = loaded_account.write_version();
                 let raw_lamports = loaded_account.lamports();
                 let zero_raw_lamports = raw_lamports == 0;
@@ -3729,7 +3737,6 @@
                     )
                 };

-                let pubkey = *loaded_account.pubkey();
                 let source_item = CalculateHashIntermediate::new(
                     version,
                     *loaded_account.loaded_hash(),
@@ -3737,12 +3744,11 @@
                     slot,
                     pubkey,
                 );
-                let rng_index = pubkey.as_ref()[0] as usize * bins / max_plus_1;
                 let max = accum.len();
                 if max == 0 {
                     accum.extend(vec![Vec::new(); bins]);
                 }
-                accum[rng_index].push(source_item);
+                accum[pubkey_to_bin_index].push(source_item);
             },
         );
         time.stop();
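Reviewer note: the binning above keys off only the first byte of the pubkey, scaled into `bins` buckets, so an account's bin is stable and cheap to recompute on every pass. A minimal standalone sketch of that arithmetic and the per-pass filter (function names here are illustrative, not from the codebase):

    use std::ops::Range;

    // First pubkey byte (0..=255) scaled into `bins` buckets, as in the diff:
    // pubkey.as_ref()[0] as usize * bins / (u8::MAX as usize + 1)
    fn pubkey_to_bin_index(first_byte: u8, bins: usize) -> usize {
        let max_plus_1 = u8::MAX as usize + 1; // 256
        first_byte as usize * bins / max_plus_1
    }

    // A pass only accumulates accounts whose bin falls inside its half-open range.
    fn in_pass(first_byte: u8, bins: usize, bin_range: &Range<usize>) -> bool {
        bin_range.contains(&pubkey_to_bin_index(first_byte, bins))
    }

    fn main() {
        // With 64 bins, bytes 0..=3 land in bin 0, bytes 4..=7 in bin 1, etc.
        assert_eq!(pubkey_to_bin_index(0, 64), 0);
        assert_eq!(pubkey_to_bin_index(4, 64), 1);
        assert_eq!(pubkey_to_bin_index(255, 64), 63);
        // A pass scanning bins 0..32 keeps byte 127 (bin 31) but not 128 (bin 32).
        assert!(in_pass(127, 64, &(0usize..32)));
        assert!(!in_pass(128, 64, &(0usize..32)));
    }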
@@ -3759,14 +3765,46 @@
         let scan_and_hash = || {
             let mut stats = HashStats::default();
             // When calculating hashes, it is helpful to break the pubkeys found into bins based on the pubkey value.
+            // More bins means smaller vectors to sort, copy, etc.
             const PUBKEY_BINS_FOR_CALCULATING_HASHES: usize = 64;
+
+            // # of passes should be a function of the total # of accounts that are active.
+            // higher passes = slower total time, lower dynamic memory usage
+            // lower passes = faster total time, higher dynamic memory usage
+            // passes=2 cuts dynamic memory usage in approximately half.
+            let num_scan_passes: usize = 2;
+
+            let bins_per_pass = PUBKEY_BINS_FOR_CALCULATING_HASHES / num_scan_passes;
+            assert_eq!(
+                bins_per_pass * num_scan_passes,
+                PUBKEY_BINS_FOR_CALCULATING_HASHES
+            ); // evenly divisible
+            let mut previous_pass = PreviousPass::default();
+            let mut final_result = (Hash::default(), 0);
+
+            for pass in 0..num_scan_passes {
+                let bounds = Range {
+                    start: pass * bins_per_pass,
+                    end: (pass + 1) * bins_per_pass,
+                };
+
                 let result = Self::scan_snapshot_stores(
                     storages,
                     &mut stats,
                     PUBKEY_BINS_FOR_CALCULATING_HASHES,
+                    &bounds,
                 );

-            AccountsHash::rest_of_hash_calculation(result, &mut stats)
+                let (hash, lamports, for_next_pass) = AccountsHash::rest_of_hash_calculation(
+                    result,
+                    &mut stats,
+                    pass == num_scan_passes - 1,
+                    previous_pass,
+                );
+                previous_pass = for_next_pass;
+                final_result = (hash, lamports);
+            }
+            final_result
         };
         if let Some(thread_pool) = thread_pool {
             thread_pool.install(scan_and_hash)
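Reviewer note: this loop is the core of the change — each pass scans only `bins_per_pass` of the 64 bins, so only that slice of intermediate hash data is resident at once. A sketch of the pass partitioning under the same constants (the helper is hypothetical, not in the diff):

    use std::ops::Range;

    // Split `total_bins` into one contiguous bin range per scan pass.
    fn pass_bounds(num_scan_passes: usize, total_bins: usize) -> Vec<Range<usize>> {
        assert_eq!(total_bins % num_scan_passes, 0); // must divide evenly, as the diff asserts
        let bins_per_pass = total_bins / num_scan_passes;
        (0..num_scan_passes)
            .map(|pass| pass * bins_per_pass..(pass + 1) * bins_per_pass)
            .collect()
    }

    fn main() {
        // With the diff's constants (64 bins, 2 passes): pass 0 scans bins 0..32 and
        // pass 1 scans bins 32..64, so each pass holds roughly half the intermediate data.
        assert_eq!(pass_bounds(2, 64), vec![0..32, 32..64]);
    }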
@@ -5015,13 +5053,49 @@ pub mod tests {
     #[should_panic(expected = "assertion failed: bins <= max_plus_1 && bins > 0")]
     fn test_accountsdb_scan_snapshot_stores_illegal_bins2() {
         let mut stats = HashStats::default();
-        AccountsDb::scan_snapshot_stores(&[], &mut stats, 257);
+        let bounds = Range { start: 0, end: 0 };
+
+        AccountsDb::scan_snapshot_stores(&[], &mut stats, 257, &bounds);
     }
     #[test]
     #[should_panic(expected = "assertion failed: bins <= max_plus_1 && bins > 0")]
     fn test_accountsdb_scan_snapshot_stores_illegal_bins() {
         let mut stats = HashStats::default();
-        AccountsDb::scan_snapshot_stores(&[], &mut stats, 0);
+        let bounds = Range { start: 0, end: 0 };
+
+        AccountsDb::scan_snapshot_stores(&[], &mut stats, 0, &bounds);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "bin_range.start < bins && bin_range.end <= bins &&\\n bin_range.start < bin_range.end"
+    )]
+    fn test_accountsdb_scan_snapshot_stores_illegal_range_start() {
+        let mut stats = HashStats::default();
+        let bounds = Range { start: 2, end: 2 };
+
+        AccountsDb::scan_snapshot_stores(&[], &mut stats, 2, &bounds);
+    }
+    #[test]
+    #[should_panic(
+        expected = "bin_range.start < bins && bin_range.end <= bins &&\\n bin_range.start < bin_range.end"
+    )]
+    fn test_accountsdb_scan_snapshot_stores_illegal_range_end() {
+        let mut stats = HashStats::default();
+        let bounds = Range { start: 1, end: 3 };
+
+        AccountsDb::scan_snapshot_stores(&[], &mut stats, 2, &bounds);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "bin_range.start < bins && bin_range.end <= bins &&\\n bin_range.start < bin_range.end"
+    )]
+    fn test_accountsdb_scan_snapshot_stores_illegal_range_inverse() {
+        let mut stats = HashStats::default();
+        let bounds = Range { start: 1, end: 0 };
+
+        AccountsDb::scan_snapshot_stores(&[], &mut stats, 2, &bounds);
     }

     fn sample_storages_and_accounts() -> (SnapshotStorages, Vec<CalculateHashIntermediate>) {
@@ -5108,11 +5182,28 @@

         let bins = 1;
         let mut stats = HashStats::default();
-        let result = AccountsDb::scan_snapshot_stores(&storages, &mut stats, bins);
+
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 0,
+                end: bins,
+            },
+        );
         assert_eq!(result, vec![vec![raw_expected.clone()]]);

         let bins = 2;
-        let result = AccountsDb::scan_snapshot_stores(&storages, &mut stats, bins);
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 0,
+                end: bins,
+            },
+        );
         let mut expected = vec![Vec::new(); bins];
         expected[0].push(raw_expected[0].clone());
         expected[0].push(raw_expected[1].clone());
@@ -5121,7 +5212,15 @@
         assert_eq!(result, vec![expected]);

         let bins = 4;
-        let result = AccountsDb::scan_snapshot_stores(&storages, &mut stats, bins);
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 0,
+                end: bins,
+            },
+        );
         let mut expected = vec![Vec::new(); bins];
         expected[0].push(raw_expected[0].clone());
         expected[1].push(raw_expected[1].clone());
@@ -5130,7 +5229,15 @@
         assert_eq!(result, vec![expected]);

         let bins = 256;
-        let result = AccountsDb::scan_snapshot_stores(&storages, &mut stats, bins);
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 0,
+                end: bins,
+            },
+        );
         let mut expected = vec![Vec::new(); bins];
         expected[0].push(raw_expected[0].clone());
         expected[127].push(raw_expected[1].clone());
@@ -5151,13 +5258,126 @@
         storages[0].splice(0..0, vec![arc; MAX_ITEMS_PER_CHUNK]);

         let mut stats = HashStats::default();
-        let result = AccountsDb::scan_snapshot_stores(&storages, &mut stats, bins);
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 0,
+                end: bins,
+            },
+        );
         assert_eq!(result.len(), 2); // 2 chunks
         assert_eq!(result[0].len(), 0); // nothing found in first slots
         assert_eq!(result[1].len(), bins);
         assert_eq!(result[1], vec![raw_expected]);
     }

+    #[test]
+    fn test_accountsdb_scan_snapshot_stores_binning() {
+        let mut stats = HashStats::default();
+        let (mut storages, raw_expected) = sample_storages_and_accounts();
+
+        // just the first bin of 2
+        let bins = 2;
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 0,
+                end: bins / 2,
+            },
+        );
+        let mut expected = vec![Vec::new(); bins];
+        expected[0].push(raw_expected[0].clone());
+        expected[0].push(raw_expected[1].clone());
+        assert_eq!(result, vec![expected]);
+
+        // just the second bin of 2
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 1,
+                end: bins,
+            },
+        );
+
+        let mut expected = vec![Vec::new(); bins];
+        expected[bins - 1].push(raw_expected[2].clone());
+        expected[bins - 1].push(raw_expected[3].clone());
+        assert_eq!(result, vec![expected]);
+
+        // 1 bin at a time of 4
+        let bins = 4;
+        for bin in 0..bins {
+            let result = AccountsDb::scan_snapshot_stores(
+                &storages,
+                &mut stats,
+                bins,
+                &Range {
+                    start: bin,
+                    end: bin + 1,
+                },
+            );
+            let mut expected = vec![Vec::new(); bins];
+            expected[bin].push(raw_expected[bin].clone());
+            assert_eq!(result, vec![expected]);
+        }
+
+        let bins = 256;
+        let bin_locations = vec![0, 127, 128, 255];
+        for bin in 0..bins {
+            let result = AccountsDb::scan_snapshot_stores(
+                &storages,
+                &mut stats,
+                bins,
+                &Range {
+                    start: bin,
+                    end: bin + 1,
+                },
+            );
+            let mut expected = vec![];
+            if let Some(index) = bin_locations.iter().position(|&r| r == bin) {
+                expected = vec![Vec::new(); bins];
+                expected[bin].push(raw_expected[index].clone());
+            }
+            assert_eq!(result, vec![expected]);
+        }
+
+        // enough stores to get to 2nd chunk
+        // range is for only 1 bin out of 256.
+        let bins = 256;
+        let (_temp_dirs, paths) = get_temp_accounts_paths(1).unwrap();
+        let slot_expected: Slot = 0;
+        let size: usize = 123;
+        let data = AccountStorageEntry::new(&paths[0], slot_expected, 0, size as u64);
+
+        let arc = Arc::new(data);
+
+        const MAX_ITEMS_PER_CHUNK: usize = 5_000;
+        storages[0].splice(0..0, vec![arc; MAX_ITEMS_PER_CHUNK]);
+
+        let mut stats = HashStats::default();
+        let result = AccountsDb::scan_snapshot_stores(
+            &storages,
+            &mut stats,
+            bins,
+            &Range {
+                start: 127,
+                end: 128,
+            },
+        );
+        assert_eq!(result.len(), 2); // 2 chunks
+        assert_eq!(result[0].len(), 0); // nothing found in first slots
+        let mut expected = vec![Vec::new(); bins];
+        expected[127].push(raw_expected[1].clone());
+        assert_eq!(result[1].len(), bins);
+        assert_eq!(result[1], expected);
+    }
+
     #[test]
     fn test_accountsdb_calculate_accounts_hash_without_index_simple() {
         solana_logger::setup();
@@ -11,6 +11,13 @@ use std::{convert::TryInto, sync::Mutex};
 pub const ZERO_RAW_LAMPORTS_SENTINEL: u64 = std::u64::MAX;
 pub const MERKLE_FANOUT: usize = 16;

+#[derive(Default, Debug)]
+pub struct PreviousPass {
+    pub reduced_hashes: Vec<Vec<Hash>>,
+    pub remaining_unhashed: Vec<Hash>,
+    pub lamports: u64,
+}
+
 #[derive(Debug, Default)]
 pub struct HashStats {
     pub scan_time_total_us: u64,
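Reviewer note: `PreviousPass` is the state threaded between passes — partial merkle results already reduced to the target fanout level (`reduced_hashes`), tail hashes that did not divide evenly and must be re-fed (`remaining_unhashed`), and the running lamport sum. A hedged sketch of the intended calling protocol, assuming this crate's `AccountsHash`, `PreviousPass`, `CalculateHashIntermediate`, `Hash`, and `HashStats` are in scope (the empty intermediates are purely illustrative):

    // Each non-final call returns state that must feed the next call; only the
    // final call (is_last_pass == true) yields the real root hash and lamports.
    fn two_pass_hash_sketch() -> (Hash, u64) {
        let mut previous_pass = PreviousPass::default();
        let mut final_result = (Hash::default(), 0u64);
        let num_passes = 2;
        for pass in 0..num_passes {
            // In the real code this is the per-pass output of scan_snapshot_stores.
            let intermediates: Vec<Vec<Vec<CalculateHashIntermediate>>> = vec![vec![vec![]]];
            let (hash, lamports, for_next_pass) = AccountsHash::rest_of_hash_calculation(
                intermediates,
                &mut HashStats::default(),
                pass == num_passes - 1,
                previous_pass,
            );
            previous_pass = for_next_pass;
            final_result = (hash, lamports);
        }
        final_result
    }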
@@ -197,8 +204,9 @@ impl AccountsHash {
             MERKLE_FANOUT,
             None,
             |start: usize| cumulative_offsets.get_slice(&hashes, start),
+            None,
         );
-        (result, hash_total)
+        (result.0, hash_total)
     }

     pub fn compute_merkle_root(hashes: Vec<(Pubkey, Hash)>, fanout: usize) -> Hash {
@@ -259,48 +267,85 @@
         }
     }

-    // This function is designed to allow hashes to be located in multiple, perhaps multiply deep vecs.
-    // The caller provides a function to return a slice from the source data.
-    pub fn compute_merkle_root_from_slices<'a, F>(
+    fn calculate_three_level_chunks(
         total_hashes: usize,
         fanout: usize,
         max_levels_per_pass: Option<usize>,
-        get_hashes: F,
-    ) -> Hash
-    where
-        F: Fn(usize) -> &'a [Hash] + std::marker::Sync,
-    {
-        if total_hashes == 0 {
-            return Hasher::default().result();
-        }
-
-        let mut time = Measure::start("time");
-
+        specific_level_count: Option<usize>,
+    ) -> (usize, usize, bool) {
         const THREE_LEVEL_OPTIMIZATION: usize = 3; // this '3' is dependent on the code structure below where we manually unroll
         let target = fanout.pow(THREE_LEVEL_OPTIMIZATION as u32);

         // Only use the 3 level optimization if we have at least 4 levels of data.
         // Otherwise, we'll be serializing a parallel operation.
         let threshold = target * fanout;
-        let three_level = max_levels_per_pass.unwrap_or(usize::MAX) >= THREE_LEVEL_OPTIMIZATION
+        let mut three_level = max_levels_per_pass.unwrap_or(usize::MAX) >= THREE_LEVEL_OPTIMIZATION
             && total_hashes >= threshold;
-        let num_hashes_per_chunk = if three_level { target } else { fanout };
+        if three_level {
+            if let Some(specific_level_count_value) = specific_level_count {
+                three_level = specific_level_count_value >= THREE_LEVEL_OPTIMIZATION;
+            }
+        }
+        let (num_hashes_per_chunk, levels_hashed) = if three_level {
+            (target, THREE_LEVEL_OPTIMIZATION)
+        } else {
+            (fanout, 1)
+        };
+        (num_hashes_per_chunk, levels_hashed, three_level)
+    }
+
+    // This function is designed to allow hashes to be located in multiple, perhaps multiply deep vecs.
+    // The caller provides a function to return a slice from the source data.
+    pub fn compute_merkle_root_from_slices<'a, F>(
+        total_hashes: usize,
+        fanout: usize,
+        max_levels_per_pass: Option<usize>,
+        get_hash_slice_starting_at_index: F,
+        specific_level_count: Option<usize>,
+    ) -> (Hash, Vec<Hash>)
+    where
+        F: Fn(usize) -> &'a [Hash] + std::marker::Sync,
+    {
+        if total_hashes == 0 {
+            return (Hasher::default().result(), vec![]);
+        }
+
+        let mut time = Measure::start("time");
+
+        let (num_hashes_per_chunk, levels_hashed, three_level) = Self::calculate_three_level_chunks(
+            total_hashes,
+            fanout,
+            max_levels_per_pass,
+            specific_level_count,
+        );
+
         let chunks = Self::div_ceil(total_hashes, num_hashes_per_chunk);

         // initial fetch - could return entire slice
-        let data: &[Hash] = get_hashes(0);
+        let data: &[Hash] = get_hash_slice_starting_at_index(0);
         let data_len = data.len();

         let result: Vec<_> = (0..chunks)
             .into_par_iter()
             .map(|i| {
+                // summary:
+                // this closure computes 1 or 3 levels of merkle tree (all chunks will be 1 or all will be 3)
+                // for a subset (our chunk) of the input data [start_index..end_index]
+
+                // index into get_hash_slice_starting_at_index where this chunk's range begins
                 let start_index = i * num_hashes_per_chunk;
+                // index into get_hash_slice_starting_at_index where this chunk's range ends
                 let end_index = std::cmp::min(start_index + num_hashes_per_chunk, total_hashes);

+                // will compute the final result for this closure
                 let mut hasher = Hasher::default();

+                // index into 'data' where we are currently pulling data
+                // if we exhaust our data, then we will request a new slice, and data_index resets to 0, the beginning of the new slice
                 let mut data_index = start_index;
+                // source data, which we may refresh when we exhaust
                 let mut data = data;
+                // len of the source data
                 let mut data_len = data_len;

                 if !three_level {
@@ -308,8 +353,8 @@
                     // The result of this loop is a single hash value from fanout input hashes.
                     for i in start_index..end_index {
                         if data_index >= data_len {
-                            // fetch next slice
-                            data = get_hashes(i);
+                            // we exhausted our data, fetch next slice starting at i
+                            data = get_hash_slice_starting_at_index(i);
                             data_len = data.len();
                             data_index = 0;
                         }
@@ -318,7 +363,43 @@
                     }
                 } else {
                     // hash 3 levels of fanout simultaneously.
+                    // This codepath produces 1 hash value for between 1..=fanout^3 input hashes.
+                    // It is equivalent to running the normal merkle tree calculation 3 iterations on the input.
+                    //
+                    // big idea:
+                    // merkle trees usually reduce the input vector by a factor of fanout with each iteration
+                    //  example with fanout 2:
+                    //  start:     [0,1,2,3,4,5,6,7]      in our case: [...16M...] or really, 1B
+                    //  iteration0 [.5, 2.5, 4.5, 6.5]                 [... 1M...]
+                    //  iteration1 [1.5, 5.5]                          [...65k...]
+                    //  iteration2 3.5                                 [...4k... ]
+                    // So iteration 0 consumes N elements, hashes them in groups of 'fanout' and produces a vector of N/fanout elements
+                    //   and the process repeats until there is only 1 hash left.
+                    //
+                    // With the three_level code path, we make each chunk we iterate of size fanout^3 (4096)
+                    // So, the input could be 16M hashes and the output will be 4k hashes, or N/fanout^3
+                    // The goal is to reduce the amount of data that has to be constructed and held in memory.
+                    // When we know we have enough hashes, then, in 1 pass, we hash 3 levels simultaneously, storing far fewer intermediate hashes.
+                    //
+                    // Now, some details:
                     // The result of this loop is a single hash value from fanout^3 input hashes.
+                    // concepts:
+                    //  what we're conceptually hashing: "raw_hashes"[start_index..end_index]
+                    //   example: [a,b,c,d,e,f]
+                    //   but... hashes[] may really be multiple vectors that are pieced together.
+                    //   example: [[a,b],[c],[d,e,f]]
+                    //  get_hash_slice_starting_at_index(any_index) abstracts that and returns a slice starting at raw_hashes[any_index..]
+                    //   such that the end of get_hash_slice_starting_at_index may be <, >, or = end_index
+                    //   example: get_hash_slice_starting_at_index(1) returns [b]
+                    //            get_hash_slice_starting_at_index(3) returns [d,e,f]
+                    // This code is basically 3 iterations of merkle tree hashing occurring simultaneously.
+                    // The first fanout raw hashes are hashed in hasher_k. This is iteration0
+                    // Once hasher_k has hashed fanout hashes, hasher_k's result hash is hashed in hasher_j and then discarded
+                    // hasher_k then starts over fresh and hashes the next fanout raw hashes. This is iteration0 again for a new set of data.
+                    // Once hasher_j has hashed fanout hashes (from k), hasher_j's result hash is hashed in hasher and then discarded
+                    // Once hasher has hashed fanout hashes (from j), then the result of hasher is the hash for fanout^3 raw hashes.
+                    // If there are < fanout^3 hashes, then this code stops when it runs out of raw hashes and returns whatever it hashed.
+                    // This is always how the very last elements work in a merkle tree.
                     let mut i = start_index;
                     while i < end_index {
                         let mut hasher_j = Hasher::default();
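Reviewer note: the iteration table in the new comments reduces N hashes by a factor of fanout per merkle level, so hashing three levels in one pass shrinks a chunk by fanout^3. A quick check of the sizes quoted in the comments (a sketch, not code from the diff):

    // Each merkle level divides the hash count by `fanout` (rounding up),
    // so three levels at once divide it by fanout^3.
    fn div_ceil(a: usize, b: usize) -> usize {
        (a + b - 1) / b
    }

    fn main() {
        let fanout: usize = 16;
        let n: usize = 16_777_216; // the "16M" from the comment
        assert_eq!(div_ceil(n, fanout.pow(3)), 4096); // the "...4k..." from the comment
    }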
@@ -327,8 +408,8 @@
                             let end = std::cmp::min(end_index - i, fanout);
                             for _k in 0..end {
                                 if data_index >= data_len {
-                                    // fetch next slice
-                                    data = get_hashes(i);
+                                    // we exhausted our data, fetch next slice starting at i
+                                    data = get_hash_slice_starting_at_index(i);
                                     data_len = data.len();
                                     data_index = 0;
                                 }
@@ -351,13 +432,47 @@
         time.stop();
         debug!("hashing {} {}", total_hashes, time);

+        if let Some(mut specific_level_count_value) = specific_level_count {
+            specific_level_count_value -= levels_hashed;
+            if specific_level_count_value == 0 {
+                (Hash::default(), result)
+            } else {
+                assert!(specific_level_count_value > 0);
+                // We did not hash the number of levels required by 'specific_level_count', so repeat
+                Self::compute_merkle_root_from_slices_recurse(
+                    result,
+                    fanout,
+                    max_levels_per_pass,
+                    Some(specific_level_count_value),
+                )
+            }
+        } else {
+            (
         if result.len() == 1 {
             result[0]
         } else {
             Self::compute_merkle_root_recurse(result, fanout)
+                },
+                vec![], // no intermediate results needed by caller
+            )
         }
     }

+    pub fn compute_merkle_root_from_slices_recurse(
+        hashes: Vec<Hash>,
+        fanout: usize,
+        max_levels_per_pass: Option<usize>,
+        specific_level_count: Option<usize>,
+    ) -> (Hash, Vec<Hash>) {
+        Self::compute_merkle_root_from_slices(
+            hashes.len(),
+            fanout,
+            max_levels_per_pass,
+            |start| &hashes[start..],
+            specific_level_count,
+        )
+    }
+
     pub fn accumulate_account_hashes(mut hashes: Vec<(Pubkey, Hash)>) -> Hash {
         Self::sort_hashes_by_pubkey(&mut hashes);

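Reviewer note: `calculate_three_level_chunks` (added above) chooses between single-level chunks of `fanout` hashes and three-level chunks of `fanout^3`. A standalone restatement of that decision logic, mirroring the diff under the same constants (illustrative sketch, not the crate's code):

    fn chunk_params(
        total_hashes: usize,
        fanout: usize,
        max_levels_per_pass: Option<usize>,
        specific_level_count: Option<usize>,
    ) -> (usize, usize, bool) {
        const THREE_LEVEL_OPTIMIZATION: usize = 3;
        let target = fanout.pow(THREE_LEVEL_OPTIMIZATION as u32);
        // need at least 4 levels of data so the parallel chunks aren't serialized
        let threshold = target * fanout;
        let mut three_level = max_levels_per_pass.unwrap_or(usize::MAX) >= THREE_LEVEL_OPTIMIZATION
            && total_hashes >= threshold;
        if three_level {
            if let Some(specific) = specific_level_count {
                three_level = specific >= THREE_LEVEL_OPTIMIZATION;
            }
        }
        if three_level {
            (target, THREE_LEVEL_OPTIMIZATION, true)
        } else {
            (fanout, 1, false)
        }
    }

    fn main() {
        // 1M hashes with fanout 16: 16^4 = 65536 <= 1M, so chunks of 16^3 = 4096 hashes,
        // each hashed three levels deep in one parallel task.
        assert_eq!(chunk_params(1_000_000, 16, None, None), (4096, 3, true));
        // Too few hashes: plain single-level chunks of `fanout`.
        assert_eq!(chunk_params(1_000, 16, None, None), (16, 1, false));
    }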
@@ -405,7 +520,7 @@
         }
         flatten_time.stop();
         stats.flatten_time_total_us += flatten_time.as_us();
-        stats.unreduced_entries = raw_len;
+        stats.unreduced_entries += raw_len;
         data_by_pubkey
     }

@@ -569,50 +684,120 @@
         (result, sum)
     }

-    fn flatten_hashes_and_hash(
-        hashes: Vec<Vec<Vec<Hash>>>,
-        fanout: usize,
-        stats: &mut HashStats,
-    ) -> Hash {
-        let mut hash_time = Measure::start("flat2");
-
-        let offsets = CumulativeOffsets::from_raw_2d(&hashes);
-
-        let get_slice = |start: usize| -> &[Hash] { offsets.get_slice_2d(&hashes, start) };
-        let hash = AccountsHash::compute_merkle_root_from_slices(
-            offsets.total_count,
-            fanout,
-            None,
-            get_slice,
-        );
-        hash_time.stop();
-        stats.hash_time_total_us += hash_time.as_us();
-        stats.hash_total = offsets.total_count;
-
-        hash
-    }
-
     // input:
     // vec: unordered, created by parallelism
     // vec: [0..bins] - where bins are pubkey ranges
-    // vec: [..] - items which fin in the containing bin, unordered within this vec
+    // vec: [..] - items which fit in the containing bin, unordered within this vec
     // so, assumption is middle vec is bins sorted by pubkey
     pub fn rest_of_hash_calculation(
         data_sections_by_pubkey: Vec<Vec<Vec<CalculateHashIntermediate>>>,
         mut stats: &mut HashStats,
-    ) -> (Hash, u64) {
+        is_last_pass: bool,
+        mut previous_state: PreviousPass,
+    ) -> (Hash, u64, PreviousPass) {
         let outer = Self::flatten_hash_intermediate(data_sections_by_pubkey, &mut stats);

         let sorted_data_by_pubkey = Self::sort_hash_intermediate(outer, &mut stats);

-        let (hashes, total_lamports) =
+        let (mut hashes, mut total_lamports) =
             Self::de_dup_and_eliminate_zeros(sorted_data_by_pubkey, &mut stats);

-        let hash = Self::flatten_hashes_and_hash(hashes, MERKLE_FANOUT, &mut stats);
+        total_lamports += previous_state.lamports;
+
+        if !previous_state.remaining_unhashed.is_empty() {
+            // these items were not hashed last iteration because they didn't divide evenly
+            hashes.insert(0, vec![previous_state.remaining_unhashed]);
+            previous_state.remaining_unhashed = Vec::new();
+        }
+
+        let mut next_pass = PreviousPass::default();
+        let cumulative = CumulativeOffsets::from_raw_2d(&hashes);
+        let mut hash_total = cumulative.total_count;
+        stats.hash_total += hash_total;
+        next_pass.reduced_hashes = previous_state.reduced_hashes;
+
+        const TARGET_FANOUT_LEVEL: usize = 3;
+        let target_fanout = MERKLE_FANOUT.pow(TARGET_FANOUT_LEVEL as u32);
+
+        if !is_last_pass {
+            next_pass.lamports = total_lamports;
+            total_lamports = 0;
+
+            // Save hashes that don't evenly hash. They will be combined with hashes from the next pass.
+            let left_over_hashes = hash_total % target_fanout;
+
+            // move tail hashes that don't evenly hash into a 1d vector for next time
+            let mut i = hash_total - left_over_hashes;
+            while i < hash_total {
+                let data = cumulative.get_slice_2d(&hashes, i);
+                next_pass.remaining_unhashed.extend(data);
+                i += data.len();
+            }
+
+            hash_total -= left_over_hashes; // this is enough to cause the hashes at the end of the data set to be ignored
+        }
+
+        // if we have raw hashes to process and
+        //   we are not the last pass (we already modded against target_fanout) OR
+        //   we have previously surpassed target_fanout and hashed some already to the target_fanout level. In that case, we know
+        //     we need to hash whatever is left here to the target_fanout level.
+        if hash_total != 0 && (!is_last_pass || !next_pass.reduced_hashes.is_empty()) {
+            let mut hash_time = Measure::start("hash");
+            let partial_hashes = Self::compute_merkle_root_from_slices(
+                hash_total, // note this does not include the ones that didn't divide evenly, unless we're in the last iteration
+                MERKLE_FANOUT,
+                Some(TARGET_FANOUT_LEVEL),
+                |start| cumulative.get_slice_2d(&hashes, start),
+                Some(TARGET_FANOUT_LEVEL),
+            )
+            .1;
+            hash_time.stop();
+            stats.hash_time_total_us += hash_time.as_us();
+            next_pass.reduced_hashes.push(partial_hashes);
+        }
+
+        let no_progress = is_last_pass && next_pass.reduced_hashes.is_empty() && !hashes.is_empty();
+        if no_progress {
+            // we never made partial progress, so hash everything now
+            hashes.into_iter().for_each(|v| {
+                v.into_iter().for_each(|v| {
+                    if !v.is_empty() {
+                        next_pass.reduced_hashes.push(v);
+                    }
+                });
+            });
+        }
+
+        let hash = if is_last_pass {
+            let cumulative = CumulativeOffsets::from_raw(&next_pass.reduced_hashes);
+
+            let hash = if cumulative.total_count == 1 && !no_progress {
+                // all the passes resulted in a single hash, that means we're done, so we had <= MERKLE_ROOT total hashes
+                cumulative.get_slice(&next_pass.reduced_hashes, 0)[0]
+            } else {
+                let mut hash_time = Measure::start("hash");
+                // hash all the rest and combine and hash until we have only 1 hash left
+                let (hash, _) = Self::compute_merkle_root_from_slices(
+                    cumulative.total_count,
+                    MERKLE_FANOUT,
+                    None,
+                    |start| cumulative.get_slice(&next_pass.reduced_hashes, start),
+                    None,
+                );
+                hash_time.stop();
+                stats.hash_time_total_us += hash_time.as_us();
+                hash
+            };
+            next_pass.reduced_hashes = Vec::new();
+            hash
+        } else {
+            Hash::default()
+        };

+        if is_last_pass {
             stats.log();
+        }
-        (hash, total_lamports)
+        (hash, total_lamports, next_pass)
     }
 }

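Reviewer note: on a non-final pass only a multiple of `target_fanout` (16^3 = 4096) hashes is reduced; the remainder is carried forward in `remaining_unhashed` so every pass hashes an aligned prefix. The tail split, restated as a sketch:

    fn split_tail(hash_total: usize) -> (usize, usize) {
        // MERKLE_FANOUT^TARGET_FANOUT_LEVEL = 16^3 in the diff
        const TARGET_FANOUT: usize = 16 * 16 * 16; // 4096
        let left_over_hashes = hash_total % TARGET_FANOUT;
        (hash_total - left_over_hashes, left_over_hashes)
    }

    fn main() {
        // 4097 hashes on a non-final pass: 4096 are reduced now, 1 is carried in
        // remaining_unhashed -- matching the "first 4097 hashes (1 left over)" test below.
        assert_eq!(split_tail(4097), (4096, 1));
    }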
@@ -662,6 +847,8 @@ pub mod tests {
         let result = AccountsHash::rest_of_hash_calculation(
             vec![vec![account_maps.clone()]],
             &mut HashStats::default(),
+            true,
+            PreviousPass::default(),
         );
         let expected_hash = Hash::from_str("8j9ARGFv4W2GfML7d3sVJK2MePwrikqYnu6yqer28cCa").unwrap();
         assert_eq!((result.0, result.1), (expected_hash, 88));
@@ -675,6 +862,8 @@
         let result = AccountsHash::rest_of_hash_calculation(
             vec![vec![account_maps.clone()]],
             &mut HashStats::default(),
+            true,
+            PreviousPass::default(),
         );
         let expected_hash = Hash::from_str("EHv9C5vX7xQjjMpsJMzudnDTzoTSRwYkqLzY8tVMihGj").unwrap();
         assert_eq!((result.0, result.1), (expected_hash, 108));
@@ -688,11 +877,302 @@
         let result = AccountsHash::rest_of_hash_calculation(
             vec![vec![account_maps]],
             &mut HashStats::default(),
+            true,
+            PreviousPass::default(),
         );
         let expected_hash = Hash::from_str("7NNPg5A8Xsg1uv4UFm6KZNwsipyyUnmgCrznP6MBWoBZ").unwrap();
         assert_eq!((result.0, result.1), (expected_hash, 118));
     }

+    #[test]
+    fn test_accountsdb_multi_pass_rest_of_hash_calculation() {
+        solana_logger::setup();
+
+        // passes:
+        // 0: empty, NON-empty, empty, empty final
+        // 1: NON-empty, empty final
+        // 2: NON-empty, empty, empty final
+        for pass in 0..3 {
+            let mut account_maps: Vec<CalculateHashIntermediate> = Vec::new();
+
+            let key = Pubkey::new(&[11u8; 32]);
+            let hash = Hash::new(&[1u8; 32]);
+            let val = CalculateHashIntermediate::new(0, hash, 88, Slot::default(), key);
+            account_maps.push(val);
+
+            // 2nd key - zero lamports, so will be removed
+            let key = Pubkey::new(&[12u8; 32]);
+            let hash = Hash::new(&[2u8; 32]);
+            let val = CalculateHashIntermediate::new(
+                0,
+                hash,
+                ZERO_RAW_LAMPORTS_SENTINEL,
+                Slot::default(),
+                key,
+            );
+            account_maps.push(val);
+
+            let mut previous_pass = PreviousPass::default();
+
+            if pass == 0 {
+                // first pass that is not last and is empty
+                let result = AccountsHash::rest_of_hash_calculation(
+                    vec![vec![vec![]]],
+                    &mut HashStats::default(),
+                    false, // not last pass
+                    previous_pass,
+                );
+                assert_eq!(result.0, Hash::default());
+                assert_eq!(result.1, 0);
+                previous_pass = result.2;
+                assert_eq!(previous_pass.remaining_unhashed.len(), 0);
+                assert_eq!(previous_pass.reduced_hashes.len(), 0);
+                assert_eq!(previous_pass.lamports, 0);
+            }
+
+            let result = AccountsHash::rest_of_hash_calculation(
+                vec![vec![account_maps.clone()]],
+                &mut HashStats::default(),
+                false, // not last pass
+                previous_pass,
+            );
+
+            assert_eq!(result.0, Hash::default());
+            assert_eq!(result.1, 0);
+            let mut previous_pass = result.2;
+            assert_eq!(previous_pass.remaining_unhashed, vec![account_maps[0].hash]);
+            assert_eq!(previous_pass.reduced_hashes.len(), 0);
+            assert_eq!(previous_pass.lamports, account_maps[0].lamports);
+
+            let expected_hash =
+                Hash::from_str("8j9ARGFv4W2GfML7d3sVJK2MePwrikqYnu6yqer28cCa").unwrap();
+            if pass == 2 {
+                let result = AccountsHash::rest_of_hash_calculation(
+                    vec![vec![vec![]]],
+                    &mut HashStats::default(),
+                    false,
+                    previous_pass,
+                );
+
+                previous_pass = result.2;
+                assert_eq!(previous_pass.remaining_unhashed, vec![account_maps[0].hash]);
+                assert_eq!(previous_pass.reduced_hashes.len(), 0);
+                assert_eq!(previous_pass.lamports, account_maps[0].lamports);
+            }
+
+            let result = AccountsHash::rest_of_hash_calculation(
+                vec![vec![vec![]]],
+                &mut HashStats::default(),
+                true, // finally, last pass
+                previous_pass,
+            );
+            let previous_pass = result.2;
+
+            assert_eq!(previous_pass.remaining_unhashed.len(), 0);
+            assert_eq!(previous_pass.reduced_hashes.len(), 0);
+            assert_eq!(previous_pass.lamports, 0);
+
+            assert_eq!((result.0, result.1), (expected_hash, 88));
+        }
+    }
+
+    #[test]
+    fn test_accountsdb_multi_pass_rest_of_hash_calculation_partial() {
+        solana_logger::setup();
+
+        let mut account_maps: Vec<CalculateHashIntermediate> = Vec::new();
+
+        let key = Pubkey::new(&[11u8; 32]);
+        let hash = Hash::new(&[1u8; 32]);
+        let val = CalculateHashIntermediate::new(0, hash, 88, Slot::default(), key);
+        account_maps.push(val);
+
+        let key = Pubkey::new(&[12u8; 32]);
+        let hash = Hash::new(&[2u8; 32]);
+        let val = CalculateHashIntermediate::new(0, hash, 20, Slot::default(), key);
+        account_maps.push(val);
+
+        let result = AccountsHash::rest_of_hash_calculation(
+            vec![vec![vec![account_maps[0].clone()]]],
+            &mut HashStats::default(),
+            false, // not last pass
+            PreviousPass::default(),
+        );
+
+        assert_eq!(result.0, Hash::default());
+        assert_eq!(result.1, 0);
+        let previous_pass = result.2;
+        assert_eq!(previous_pass.remaining_unhashed, vec![account_maps[0].hash]);
+        assert_eq!(previous_pass.reduced_hashes.len(), 0);
+        assert_eq!(previous_pass.lamports, account_maps[0].lamports);
+
+        let result = AccountsHash::rest_of_hash_calculation(
+            vec![vec![vec![account_maps[1].clone()]]],
+            &mut HashStats::default(),
+            false, // not last pass
+            previous_pass,
+        );
+
+        assert_eq!(result.0, Hash::default());
+        assert_eq!(result.1, 0);
+        let previous_pass = result.2;
+        assert_eq!(
+            previous_pass.remaining_unhashed,
+            vec![account_maps[0].hash, account_maps[1].hash]
+        );
+        assert_eq!(previous_pass.reduced_hashes.len(), 0);
+        let total_lamports_expected = account_maps[0].lamports + account_maps[1].lamports;
+        assert_eq!(previous_pass.lamports, total_lamports_expected);
+
+        let result = AccountsHash::rest_of_hash_calculation(
+            vec![vec![vec![]]],
+            &mut HashStats::default(),
+            true,
+            previous_pass,
+        );
+
+        let previous_pass = result.2;
+        assert_eq!(previous_pass.remaining_unhashed.len(), 0);
+        assert_eq!(previous_pass.reduced_hashes.len(), 0);
+        assert_eq!(previous_pass.lamports, 0);
+
+        let expected_hash = AccountsHash::compute_merkle_root(
+            account_maps
+                .iter()
+                .map(|a| (a.pubkey, a.hash))
+                .collect::<Vec<_>>(),
+            MERKLE_FANOUT,
+        );
+
+        assert_eq!(
+            (result.0, result.1),
+            (expected_hash, total_lamports_expected)
+        );
+    }
+
+    #[test]
+    fn test_accountsdb_multi_pass_rest_of_hash_calculation_partial_hashes() {
+        solana_logger::setup();
+
+        let mut account_maps: Vec<CalculateHashIntermediate> = Vec::new();
+
+        const TARGET_FANOUT_LEVEL: usize = 3;
+        let target_fanout = MERKLE_FANOUT.pow(TARGET_FANOUT_LEVEL as u32);
+        let mut total_lamports_expected = 0;
+        let plus1 = target_fanout + 1;
+        for i in 0..plus1 * 2 {
+            let lamports = (i + 1) as u64;
+            total_lamports_expected += lamports;
+            let key = Pubkey::new_unique();
+            let hash = Hash::new_unique();
+            let val = CalculateHashIntermediate::new(0, hash, lamports, Slot::default(), key);
+            account_maps.push(val);
+        }
+
+        let chunk = account_maps[0..plus1].to_vec();
+        let mut sorted = chunk.clone();
+        sorted.sort_by(AccountsHash::compare_two_hash_entries);
+
+        // first 4097 hashes (1 left over)
+        let result = AccountsHash::rest_of_hash_calculation(
+            vec![vec![chunk]],
+            &mut HashStats::default(),
+            false, // not last pass
+            PreviousPass::default(),
+        );
+
+        assert_eq!(result.0, Hash::default());
+        assert_eq!(result.1, 0);
+        let previous_pass = result.2;
+        let left_over_1 = sorted[plus1 - 1].hash;
+        assert_eq!(previous_pass.remaining_unhashed, vec![left_over_1]);
+        assert_eq!(previous_pass.reduced_hashes.len(), 1);
+        let expected_hash = AccountsHash::compute_merkle_root(
+            sorted[0..target_fanout]
+                .iter()
+                .map(|a| (a.pubkey, a.hash))
+                .collect::<Vec<_>>(),
+            MERKLE_FANOUT,
+        );
+        assert_eq!(previous_pass.reduced_hashes[0], vec![expected_hash]);
+        assert_eq!(
+            previous_pass.lamports,
+            account_maps[0..plus1]
+                .iter()
+                .map(|i| i.lamports)
+                .sum::<u64>()
+        );
+
+        let chunk = account_maps[plus1..plus1 * 2].to_vec();
+        let mut sorted2 = chunk.clone();
+        sorted2.sort_by(AccountsHash::compare_two_hash_entries);
+
+        let mut with_left_over = vec![left_over_1];
+        with_left_over.extend(sorted2[0..plus1 - 2].to_vec().into_iter().map(|i| i.hash));
+        let expected_hash2 = AccountsHash::compute_merkle_root(
+            with_left_over[0..target_fanout]
+                .iter()
+                .map(|a| (Pubkey::default(), *a))
+                .collect::<Vec<_>>(),
+            MERKLE_FANOUT,
+        );
+
+        // second 4097 hashes (2 left over)
+        let result = AccountsHash::rest_of_hash_calculation(
+            vec![vec![chunk]],
+            &mut HashStats::default(),
+            false, // not last pass
+            previous_pass,
+        );
+
+        assert_eq!(result.0, Hash::default());
+        assert_eq!(result.1, 0);
+        let previous_pass = result.2;
+        assert_eq!(
+            previous_pass.remaining_unhashed,
+            vec![sorted2[plus1 - 2].hash, sorted2[plus1 - 1].hash]
+        );
+        assert_eq!(previous_pass.reduced_hashes.len(), 2);
+        assert_eq!(
+            previous_pass.reduced_hashes,
+            vec![vec![expected_hash], vec![expected_hash2]]
+        );
+        assert_eq!(
+            previous_pass.lamports,
+            account_maps[0..plus1 * 2]
+                .iter()
+                .map(|i| i.lamports)
+                .sum::<u64>()
+        );
+
+        let result = AccountsHash::rest_of_hash_calculation(
+            vec![vec![vec![]]],
+            &mut HashStats::default(),
+            true,
+            previous_pass,
+        );
+
+        let previous_pass = result.2;
+        assert_eq!(previous_pass.remaining_unhashed.len(), 0);
+        assert_eq!(previous_pass.reduced_hashes.len(), 0);
+        assert_eq!(previous_pass.lamports, 0);
+
+        let mut combined = sorted;
+        combined.extend(sorted2);
+        let expected_hash = AccountsHash::compute_merkle_root(
+            combined
+                .iter()
+                .map(|a| (a.pubkey, a.hash))
+                .collect::<Vec<_>>(),
+            MERKLE_FANOUT,
+        );
+
+        assert_eq!(
+            (result.0, result.1),
+            (expected_hash, total_lamports_expected)
+        );
+    }
+
     #[test]
     fn test_accountsdb_de_dup_accounts_zero_chunks() {
         let (hashes, lamports) =
@@ -923,40 +1403,6 @@
         }
     }

-    #[test]
-    fn test_accountsdb_flatten_hashes_and_hash() {
-        solana_logger::setup();
-        const COUNT: usize = 4;
-        let hashes: Vec<_> = (0..COUNT)
-            .into_iter()
-            .map(|i| Hash::new(&[(i) as u8; 32]))
-            .collect();
-        let expected =
-            AccountsHash::compute_merkle_root_loop(hashes.clone(), MERKLE_FANOUT, |i| *i);
-
-        assert_eq!(
-            AccountsHash::flatten_hashes_and_hash(
-                vec![vec![hashes.clone()]],
-                MERKLE_FANOUT,
-                &mut HashStats::default()
-            ),
-            expected,
-        );
-        for in_first in 1..COUNT - 1 {
-            assert_eq!(
-                AccountsHash::flatten_hashes_and_hash(
-                    vec![vec![
-                        hashes.clone()[0..in_first].to_vec(),
-                        hashes.clone()[in_first..COUNT].to_vec()
-                    ]],
-                    MERKLE_FANOUT,
-                    &mut HashStats::default()
-                ),
-                expected
-            );
-        }
-    }
-
     #[test]
     fn test_sort_hash_intermediate() {
         solana_logger::setup();
@@ -1367,25 +1813,23 @@
             let temp: Vec<_> = hashes.iter().map(|h| (Pubkey::default(), *h)).collect();
             let result = AccountsHash::compute_merkle_root(temp, fanout);
             let reduced: Vec<_> = hashes.clone();
-            let result2 =
-                AccountsHash::compute_merkle_root_from_slices(hashes.len(), fanout, None, |start| {
-                    &reduced[start..]
-                });
-            assert_eq!(result, result2, "len: {}", hashes.len());
-
-            let result2 =
-                AccountsHash::compute_merkle_root_from_slices(hashes.len(), fanout, Some(1), |start| {
-                    &reduced[start..]
-                });
-            assert_eq!(result, result2, "len: {}", hashes.len());
-
-            let reduced2: Vec<_> = hashes.iter().map(|x| vec![*x]).collect();
-            let result2 = AccountsHash::flatten_hashes_and_hash(
-                vec![reduced2],
+            let result2 = AccountsHash::compute_merkle_root_from_slices(
+                hashes.len(),
                 fanout,
-                &mut HashStats::default(),
+                None,
+                |start| &reduced[start..],
+                None,
             );
-            assert_eq!(result, result2, "len: {}", hashes.len());
+            assert_eq!(result, result2.0, "len: {}", hashes.len());
+
+            let result2 = AccountsHash::compute_merkle_root_from_slices(
+                hashes.len(),
+                fanout,
+                Some(1),
+                |start| &reduced[start..],
+                None,
+            );
+            assert_eq!(result, result2.0, "len: {}", hashes.len());

             let max = std::cmp::min(reduced.len(), fanout * 2);
             for left in 0..max {
@@ -1394,9 +1838,17 @@
                     vec![reduced[0..left].to_vec(), reduced[left..right].to_vec()],
                     vec![reduced[right..].to_vec()],
                 ];
-                let result2 =
-                    AccountsHash::flatten_hashes_and_hash(src, fanout, &mut HashStats::default());
-                assert_eq!(result, result2);
+                let offsets = CumulativeOffsets::from_raw_2d(&src);
+
+                let get_slice = |start: usize| -> &[Hash] { offsets.get_slice_2d(&src, start) };
+                let result2 = AccountsHash::compute_merkle_root_from_slices(
+                    offsets.total_count,
+                    fanout,
+                    None,
+                    get_slice,
+                    None,
+                );
+                assert_eq!(result, result2.0);
             }
         }
         result