Skip to content

Commit 1632fc1

Browse files
committed
txgraph: Track multiple potential would-be clusters in Trim (improvement)
In the existing Trim function, as soon as the set of accepted transactions would exceed the max cluster size or count limit, the acceptance loop is stopped, removing all later transactions. However, it is possible that by excluding some of those transactions the would-be cluster splits apart into multiple would-clusters. And those clusters may well permit far more transactions before their limits are reached. Take this into account by using a union-find structure inside TrimTxData to keep track of the count/size of all would-be clusters that would be formed at any point, and only reject transactions which would cause these resulting partitions to exceed their limits. This is not an optimization in terms of CPU usage or memory; it just improves the quality of the transactions removed by Trim().
1 parent 4608df3 commit 1632fc1

File tree

3 files changed

+131
-53
lines changed

3 files changed

+131
-53
lines changed

src/bench/txgraph.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ void BenchTxGraphTrim(benchmark::Bench& bench)
112112
});
113113

114114
assert(!graph->IsOversized());
115+
// At least 99% of chains must survive.
116+
assert(graph->GetTransactionCount() >= (NUM_TOP_CHAINS * NUM_TX_PER_TOP_CHAIN * 99) / 100);
115117
}
116118

117119
} // namespace

src/test/txgraph_tests.cpp

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,11 @@ BOOST_AUTO_TEST_CASE(txgraph_trim_zigzag)
6868
graph->SanityCheck();
6969
BOOST_CHECK(!graph->IsOversized(/*main_only=*/false));
7070

71-
BOOST_CHECK_EQUAL(removed_refs.size(), NUM_TOTAL_TX - MAX_CLUSTER_COUNT);
72-
BOOST_CHECK_EQUAL(graph->GetTransactionCount(), MAX_CLUSTER_COUNT);
73-
74-
// Only prefix of size max_cluster_count is left. That's the first half of the top and first half of the bottom.
71+
// We only need to trim the middle bottom transaction to end up with 2 clusters each within cluster limits.
72+
BOOST_CHECK_EQUAL(removed_refs.size(), 1);
73+
BOOST_CHECK_EQUAL(graph->GetTransactionCount(), MAX_CLUSTER_COUNT * 2 - 2);
7574
for (unsigned int i = 0; i < refs.size(); ++i) {
76-
const bool first_half = (i < (NUM_BOTTOM_TX / 2)) ||
77-
(i >= NUM_BOTTOM_TX && i < NUM_BOTTOM_TX + NUM_TOP_TX / 2 + 1);
78-
BOOST_CHECK_EQUAL(graph->Exists(refs[i]), first_half);
75+
BOOST_CHECK_EQUAL(graph->Exists(refs[i]), i != (NUM_BOTTOM_TX / 2));
7976
}
8077
}
8178

@@ -129,13 +126,12 @@ BOOST_AUTO_TEST_CASE(txgraph_trim_flower)
129126
graph->SanityCheck();
130127
BOOST_CHECK(!graph->IsOversized(/*main_only=*/false));
131128

132-
BOOST_CHECK_EQUAL(removed_refs.size(), NUM_TOTAL_TX - MAX_CLUSTER_COUNT);
133-
BOOST_CHECK_EQUAL(graph->GetTransactionCount(), MAX_CLUSTER_COUNT);
134-
135-
// Only prefix of size max_cluster_count (last max_cluster_count top transactions) is left.
136-
for (unsigned int i = 0; i < refs.size(); ++i) {
137-
const bool top_highest_feerate = i > (NUM_TOTAL_TX - MAX_CLUSTER_COUNT - 1);
138-
BOOST_CHECK_EQUAL(graph->Exists(refs[i]), top_highest_feerate);
129+
// Since only the bottom transaction connects these clusters, we only need to remove it.
130+
BOOST_CHECK_EQUAL(removed_refs.size(), 1);
131+
BOOST_CHECK_EQUAL(graph->GetTransactionCount(false), MAX_CLUSTER_COUNT * 2);
132+
BOOST_CHECK(!graph->Exists(refs[0]));
133+
for (unsigned int i = 1; i < refs.size(); ++i) {
134+
BOOST_CHECK(graph->Exists(refs[i]));
139135
}
140136
}
141137

@@ -248,8 +244,8 @@ BOOST_AUTO_TEST_CASE(txgraph_trim_huge)
248244
BOOST_CHECK(removed_refs.size() == total_tx_count - graph->GetTransactionCount());
249245
graph->SanityCheck();
250246

251-
// At least one original chain must survive.
252-
BOOST_CHECK(graph->GetTransactionCount() >= NUM_TX_PER_TOP_CHAIN);
247+
// At least 99% of chains must survive.
248+
BOOST_CHECK(graph->GetTransactionCount() >= (NUM_TOP_CHAINS * NUM_TX_PER_TOP_CHAIN * 99) / 100);
253249
}
254250

255251
BOOST_AUTO_TEST_CASE(txgraph_trim_big_singletons)

src/txgraph.cpp

Lines changed: 117 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,30 @@ struct TrimTxData
6868
// Fields only used internally by TxGraphImpl::Trim():
6969
/** Number of unmet dependencies this transaction has. -1 if the transaction is included. */
7070
uint32_t m_deps_left;
71+
/** Number of dependencies that apply to this transaction as child. */
72+
uint32_t m_parent_count;
73+
/** Where in deps_by_child those dependencies begin. */
74+
uint32_t m_parent_offset;
7175
/** Number of dependencies that apply to this transaction as parent. */
7276
uint32_t m_children_count;
73-
/** Where in deps those dependencies begin. */
77+
/** Where in deps_by_parent those dependencies begin. */
7478
uint32_t m_children_offset;
79+
80+
// Fields only used internally by TxGraphImpl::Trim()'s union-find implementation, and only for
81+
// transactions that are definitely included or definitely rejected.
82+
//
83+
// As transactions get processed, they get organized into trees which form partitions
84+
// representing the would-be clusters up to that point. The root of each tree is a
85+
// representative for that partition. See
86+
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure.
87+
//
88+
/** Pointer to another TrimTxData, towards the root of the tree. If this is a root, m_uf_parent
89+
* is equal to this itself. */
90+
TrimTxData* m_uf_parent;
91+
/** If this is a root, the total number of transactions in the partition. */
92+
uint32_t m_uf_count;
93+
/** If this is a root, the total size of transactions in the partition. */
94+
uint64_t m_uf_size;
7595
};
7696

7797
/** A grouping of connected transactions inside a TxGraphImpl::ClusterSet. */
@@ -2609,24 +2629,28 @@ std::vector<TxGraph::Ref*> TxGraphImpl::Trim() noexcept
26092629
// but respecting the dependencies being added.
26102630
//
26112631
// This rudimentary linearization is computed lazily, by putting all eligible (no unmet
2612-
// dependencies) transactions in a heap, and popping the highest-feerate one from it. This
2613-
// continues as long as the number or size of all picked transactions together does not exceed
2614-
// the graph's configured cluster limits. All remaining transactions are then marked as
2615-
// removed.
2632+
// dependencies) transactions in a heap, and popping the highest-feerate one from it. Along the
2633+
// way, the counts and sizes of the would-be clusters up to that point are tracked (by
2634+
// partitioning the involved transactions using a union-find structure). Any transaction whose
2635+
// addition would cause a violation is removed, along with all their descendants.
26162636
//
26172637
// A next invocation of GroupClusters (after applying the removals) will compute the new
26182638
// resulting clusters, and none of them will violate the limits.
26192639

26202640
/** All dependencies (both to be added ones, and implicit ones between consecutive transactions
2621-
* in existing cluster linearizations). */
2622-
std::vector<std::pair<GraphIndex, GraphIndex>> deps;
2641+
* in existing cluster linearizations), sorted by parent. */
2642+
std::vector<std::pair<GraphIndex, GraphIndex>> deps_by_parent;
2643+
/** Same, but sorted by child. */
2644+
std::vector<std::pair<GraphIndex, GraphIndex>> deps_by_child;
26232645
/** Information about all transactions involved in a Cluster group to be trimmed, sorted by
26242646
* GraphIndex. It contains entries both for transactions that have already been included,
26252647
* and ones that have not yet been. */
26262648
std::vector<TrimTxData> trim_data;
26272649
/** Iterators into trim_data, treated as a max heap according to cmp_fn below. Each entry is
26282650
* a transaction that has not yet been included yet, but all its ancestors have. */
26292651
std::vector<std::vector<TrimTxData>::iterator> trim_heap;
2652+
/** The list of representatives of the partitions a given transaction depends on. */
2653+
std::vector<TrimTxData*> current_deps;
26302654

26312655
/** Function to define the ordering of trim_heap. */
26322656
static constexpr auto cmp_fn = [](auto a, auto b) noexcept {
@@ -2637,6 +2661,38 @@ std::vector<TxGraph::Ref*> TxGraphImpl::Trim() noexcept
26372661
return a->m_chunk_feerate < b->m_chunk_feerate;
26382662
};
26392663

2664+
/** Given a TrimTxData entry, find the representative of the partition it is in. */
2665+
static constexpr auto find_fn = [](TrimTxData* arg) noexcept {
2666+
while (arg != arg->m_uf_parent) {
2667+
// Replace pointer to parent with pointer to grandparent (path splitting).
2668+
// See https://en.wikipedia.org/wiki/Disjoint-set_data_structure#Finding_set_representatives.
2669+
auto par = arg->m_uf_parent;
2670+
arg->m_uf_parent = par->m_uf_parent;
2671+
arg = par;
2672+
}
2673+
return arg;
2674+
};
2675+
2676+
/** Given two TrimTxData entries, union the partitions they are in, and return the
2677+
* representative. */
2678+
static constexpr auto union_fn = [](TrimTxData* arg1, TrimTxData* arg2) noexcept {
2679+
// Replace arg1 and arg2 by their representatives.
2680+
auto rep1 = find_fn(arg1);
2681+
auto rep2 = find_fn(arg2);
2682+
// Bail out if both representatives are the same, because that means arg1 and arg2 are in
2683+
// the same partition already.
2684+
if (rep1 == rep2) return rep1;
2685+
// Pick the lower-count root to become a child of the higher-count one.
2686+
// See https://en.wikipedia.org/wiki/Disjoint-set_data_structure#Union_by_size.
2687+
if (rep1->m_uf_count < rep2->m_uf_count) std::swap(rep1, rep2);
2688+
rep2->m_uf_parent = rep1;
2689+
// Add the statistics of arg2 (which is no longer a representative) to those of arg1 (which
2690+
// is now the representative for both).
2691+
rep1->m_uf_size += rep2->m_uf_size;
2692+
rep1->m_uf_count += rep2->m_uf_count;
2693+
return rep1;
2694+
};
2695+
26402696
/** Get iterator to TrimTxData entry for a given index. */
26412697
auto locate_fn = [&](GraphIndex index) noexcept {
26422698
auto it = std::lower_bound(trim_data.begin(), trim_data.end(), index, [](TrimTxData& elem, GraphIndex idx) noexcept {
@@ -2650,14 +2706,15 @@ std::vector<TxGraph::Ref*> TxGraphImpl::Trim() noexcept
26502706
for (const auto& group_data : clusterset.m_group_data->m_groups) {
26512707
trim_data.clear();
26522708
trim_heap.clear();
2653-
deps.clear();
2709+
deps_by_child.clear();
2710+
deps_by_parent.clear();
26542711

26552712
// Gather trim data and implicit dependency data from all involved Clusters.
26562713
auto cluster_span = std::span{clusterset.m_group_data->m_group_clusters}
26572714
.subspan(group_data.m_cluster_offset, group_data.m_cluster_count);
26582715
uint64_t size{0};
26592716
for (Cluster* cluster : cluster_span) {
2660-
size += cluster->AppendTrimData(trim_data, deps);
2717+
size += cluster->AppendTrimData(trim_data, deps_by_child);
26612718
}
26622719
// If this group of Clusters does not violate any limits, continue to the next group.
26632720
if (trim_data.size() <= m_max_cluster_count && size <= m_max_cluster_size) continue;
@@ -2666,53 +2723,57 @@ std::vector<TxGraph::Ref*> TxGraphImpl::Trim() noexcept
26662723
// anymore.
26672724
std::sort(trim_data.begin(), trim_data.end(), [](auto& a, auto& b) noexcept { return a.m_index < b.m_index; });
26682725

2669-
// Add the explicitly added dependencies to deps.
2670-
deps.insert(deps.end(),
2671-
clusterset.m_deps_to_add.begin() + group_data.m_deps_offset,
2672-
clusterset.m_deps_to_add.begin() + group_data.m_deps_offset + group_data.m_deps_count);
2673-
2674-
// Sort deps by child transaction GraphIndex.
2675-
std::sort(deps.begin(), deps.end(), [](auto& a, auto& b) noexcept { return a.second < b.second; });
2676-
// Fill m_deps_left in trim_data, and initially populate trim_heap. Because of the sort
2677-
// above, all dependencies involving the same child are grouped together, so a single
2678-
// linear scan suffices.
2679-
auto deps_it = deps.begin();
2726+
// Add the explicitly added dependencies to deps_by_child.
2727+
deps_by_child.insert(deps_by_child.end(),
2728+
clusterset.m_deps_to_add.begin() + group_data.m_deps_offset,
2729+
clusterset.m_deps_to_add.begin() + group_data.m_deps_offset + group_data.m_deps_count);
2730+
2731+
// Sort deps_by_child by child transaction GraphIndex. The order will not be changed
2732+
// anymore after this.
2733+
std::sort(deps_by_child.begin(), deps_by_child.end(), [](auto& a, auto& b) noexcept { return a.second < b.second; });
2734+
// Fill m_parents_count and m_parents_offset in trim_data, as well as m_deps_left, and
2735+
// initially populate trim_heap. Because of the sort above, all dependencies involving the
2736+
// same child are grouped together, so a single linear scan suffices.
2737+
auto deps_it = deps_by_child.begin();
26802738
for (auto trim_it = trim_data.begin(); trim_it != trim_data.end(); ++trim_it) {
2739+
trim_it->m_parent_offset = deps_it - deps_by_child.begin();
26812740
trim_it->m_deps_left = 0;
2682-
while (deps_it != deps.end() && deps_it->second == trim_it->m_index) {
2741+
while (deps_it != deps_by_child.end() && deps_it->second == trim_it->m_index) {
26832742
++trim_it->m_deps_left;
26842743
++deps_it;
26852744
}
2745+
trim_it->m_parent_count = trim_it->m_deps_left;
26862746
// If this transaction has no unmet dependencies, and is not oversized, add it to the
26872747
// heap (just append for now, the heapification happens below).
26882748
if (trim_it->m_deps_left == 0 && trim_it->m_tx_size <= m_max_cluster_size) {
26892749
trim_heap.push_back(trim_it);
26902750
}
26912751
}
2692-
Assume(deps_it == deps.end());
2752+
Assume(deps_it == deps_by_child.end());
26932753

2694-
// Sort deps by parent transaction GraphIndex. The order will not be changed anymore after
2695-
// this.
2696-
std::sort(deps.begin(), deps.end(), [](auto& a, auto& b) noexcept { return a.first < b.first; });
2754+
// Construct deps_by_parent, sorted by parent transaction GraphIndex. The order will not be
2755+
// changed anymore after this.
2756+
deps_by_parent = deps_by_child;
2757+
std::sort(deps_by_parent.begin(), deps_by_parent.end(), [](auto& a, auto& b) noexcept { return a.first < b.first; });
26972758
// Fill m_children_offset and m_children_count in trim_data. Because of the sort above, all
26982759
// dependencies involving the same parent are grouped together, so a single linear scan
26992760
// suffices.
2700-
deps_it = deps.begin();
2761+
deps_it = deps_by_parent.begin();
27012762
for (auto& trim_entry : trim_data) {
27022763
trim_entry.m_children_count = 0;
2703-
trim_entry.m_children_offset = deps_it - deps.begin();
2704-
while (deps_it != deps.end() && deps_it->first == trim_entry.m_index) {
2764+
trim_entry.m_children_offset = deps_it - deps_by_parent.begin();
2765+
while (deps_it != deps_by_parent.end() && deps_it->first == trim_entry.m_index) {
27052766
++trim_entry.m_children_count;
27062767
++deps_it;
27072768
}
27082769
}
2709-
Assume(deps_it == deps.end());
2770+
Assume(deps_it == deps_by_parent.end());
27102771

27112772
// Build a heap of all transactions with 0 unmet dependencies.
27122773
std::make_heap(trim_heap.begin(), trim_heap.end(), cmp_fn);
27132774

27142775
// Iterate over to-be-included transactions, and convert them to included transactions, or
2715-
// decide to stop if doing so would violate resource limits.
2776+
// skip them if adding them would violate resource limits of the would-be cluster.
27162777
//
27172778
// It is possible that the heap empties without ever hitting either cluster limit, in case
27182779
// the implied graph (to be added dependencies plus implicit dependency between each
@@ -2723,25 +2784,44 @@ std::vector<TxGraph::Ref*> TxGraphImpl::Trim() noexcept
27232784
// as all added dependencies are in the same direction (from old mempool transactions to
27242785
// new from-block transactions); cycles require dependencies in both directions to be
27252786
// added.
2726-
uint32_t total_count{0};
2727-
uint64_t total_size{0};
27282787
while (!trim_heap.empty()) {
27292788
// Move the best remaining transaction to the end of trim_heap.
27302789
std::pop_heap(trim_heap.begin(), trim_heap.end(), cmp_fn);
27312790
// Pop it, and find its TrimTxData.
27322791
auto& entry = *trim_heap.back();
27332792
trim_heap.pop_back();
27342793

2794+
// Initialize it as a singleton partition.
2795+
entry.m_uf_parent = &entry;
2796+
entry.m_uf_count = 1;
2797+
entry.m_uf_size = entry.m_tx_size;
2798+
2799+
// Find the distinct transaction partitions this entry depends on.
2800+
current_deps.clear();
2801+
for (auto& [par, chl] : std::span{deps_by_child}.subspan(entry.m_parent_offset, entry.m_parent_count)) {
2802+
Assume(chl == entry.m_index);
2803+
current_deps.push_back(find_fn(&*locate_fn(par)));
2804+
}
2805+
std::sort(current_deps.begin(), current_deps.end());
2806+
current_deps.erase(std::unique(current_deps.begin(), current_deps.end()), current_deps.end());
2807+
27352808
// Compute resource counts.
2736-
total_count += 1;
2737-
total_size += entry.m_tx_size;
2738-
// Stop if this would violate any limit.
2739-
if (total_count > m_max_cluster_count || total_size > m_max_cluster_size) break;
2809+
uint32_t new_count = 1;
2810+
uint64_t new_size = entry.m_tx_size;
2811+
for (TrimTxData* ptr : current_deps) {
2812+
new_count += ptr->m_uf_count;
2813+
new_size += ptr->m_uf_size;
2814+
}
2815+
// Skip the entry if this would violate any limit.
2816+
if (new_count > m_max_cluster_count || new_size > m_max_cluster_size) continue;
27402817

2818+
// Union the partitions this transaction and all its dependencies are in together.
2819+
auto rep = &entry;
2820+
for (TrimTxData* ptr : current_deps) rep = union_fn(ptr, rep);
27412821
// Mark the entry as included (so the loop below will not remove the transaction).
27422822
entry.m_deps_left = uint32_t(-1);
27432823
// Mark each to-be-added dependency involving this transaction as parent satisfied.
2744-
for (auto& [par, chl] : std::span{deps}.subspan(entry.m_children_offset, entry.m_children_count)) {
2824+
for (auto& [par, chl] : std::span{deps_by_parent}.subspan(entry.m_children_offset, entry.m_children_count)) {
27452825
Assume(par == entry.m_index);
27462826
auto chl_it = locate_fn(chl);
27472827
// Reduce the number of unmet dependencies of chl_it, and if that brings the number

0 commit comments

Comments
 (0)