Improve dedup (#19565)

arvidn · web-flow · commit 5e0a0df94a26 · 2025-05-02T09:58:46.000-07:00
* improve dedup

* address review comments
diff --git a/chia/_tests/core/mempool/test_mempool_manager.py b/chia/_tests/core/mempool/test_mempool_manager.py
@@ -34,6 +34,8 @@
     can_replace,
     check_removals,
     compute_assert_height,
+    is_atom_canonical,
+    is_clvm_canonical,
     optional_max,
     optional_min,
 )
@@ -88,6 +90,74 @@
 TEST_HEIGHT = uint32(5)
 
 
+@pytest.mark.parametrize("clvm_hex", ["80", "ff8080", "ff7f03", "ffff8080ff8080"])
+def test_clvm_canonical(clvm_hex: str) -> None:
+    # NIL, flat pairs and a nested tree, all built from single-byte atoms
+    assert is_clvm_canonical(bytes.fromhex(clvm_hex))
+
+
+@pytest.mark.parametrize(
+    "clvm_hex",
+    [
+        "fffe80",
+        "c000",
+        "c03f",
+        "e00000",
+        "e01fff",
+        "f0000000",
+        "f00fffff",
+        "f800000000",
+        "f807ffffff",
+        "fc0000000000",
+        "fc03ffffffff",
+        "fe",
+        "ff808080",
+    ],
+)
+def test_clvm_not_canonical(clvm_hex: str) -> None:
+    # over-long length prefixes, back-references and trailing bytes are rejected
+    assert not is_clvm_canonical(bytes.fromhex(clvm_hex))
+
+
+@pytest.mark.parametrize(
+    "clvm_hex, expect",
+    [
+        ("c000", 2 + 0),
+        ("c03f", 2 + 0x3F),
+        ("e00000", 3 + 0),
+        ("e01fff", 3 + 0x1FFF),
+        ("f0000000", 4 + 0),
+        ("f00fffff", 4 + 0xFFFFF),
+        ("f800000000", 5 + 0),
+        ("f807ffffff", 5 + 0x7FFFFFF),
+        ("fc0000000000", 6 + 0),
+        ("fc03ffffffff", 6 + 0x3FFFFFFFF),
+    ],
+)
+def test_atom_not_canonical(clvm_hex: str, expect: int) -> None:
+    # the length in each header would also fit in a header one byte shorter
+    total_len, canonical = is_atom_canonical(bytes.fromhex(clvm_hex), 0)
+    assert total_len == expect
+    assert not canonical
+
+
+@pytest.mark.parametrize(
+    "clvm_hex, expect",
+    [
+        ("c040", 2 + 0x40),
+        ("e02000", 3 + 0x2000),
+        ("f0100000", 4 + 0x100000),
+        ("f808000000", 5 + 0x8000000),
+        ("fc0400000000", 6 + 0x400000000),
+    ],
+)
+def test_atom_canonical(clvm_hex: str, expect: int) -> None:
+    # each length is the smallest value that requires a header of that size
+    total_len, canonical = is_atom_canonical(bytes.fromhex(clvm_hex), 0)
+    assert total_len == expect
+    assert canonical
+
+
 @dataclasses.dataclass(frozen=True)
 class TestBlockRecord:
     """
@@ -760,8 +830,12 @@ def test_optional_max() -> None:
     assert optional_max(uint32(123), uint32(234)) == uint32(234)
 
 
-def mk_coin_spend(coin: Coin) -> CoinSpend:
-    return make_spend(coin, SerializedProgram.to(None), SerializedProgram.to(None))
+def mk_coin_spend(coin: Coin, solution: Optional[str] = None) -> CoinSpend:
+    return make_spend(
+        coin,
+        SerializedProgram.to(None),
+        SerializedProgram.to(bytes.fromhex(solution) if solution is not None else None),
+    )
 
 
 def mk_bcs(coin_spend: CoinSpend, flags: int = 0) -> BundleCoinSpend:
@@ -781,6 +855,7 @@ def mk_item(
     assert_height: Optional[int] = None,
     assert_before_height: Optional[int] = None,
     assert_before_seconds: Optional[int] = None,
+    solution: Optional[str] = None,
     flags: list[int] = [],
 ) -> MempoolItem:
     # we don't actually care about the puzzle and solutions for the purpose of
@@ -793,7 +868,8 @@ def mk_item(
     for c, f in zip(coins, flags):
         coin_id = c.name()
         spend_ids.append((coin_id, f))
-        coin_spend = mk_coin_spend(c)
+        coin_spend = mk_coin_spend(c, solution=solution)
+        solution = None
         coin_spends.append(coin_spend)
         bundle_coin_spends[coin_id] = mk_bcs(coin_spend, f)
     spend_bundle = SpendBundle(coin_spends, G2Element())
@@ -1642,6 +1718,7 @@ async def get_coin_records(coin_ids: Collection[bytes32]) -> list[CoinRecord]:
 
     mempool_manager = await instantiate_mempool_manager(get_coin_records)
     # Create a bunch of mempool items that spend the coin in different ways
+    # only the first one will be accepted
     for i in range(3):
         _, _, result = await generate_and_add_spendbundle(
             mempool_manager,
@@ -1651,10 +1728,13 @@ async def get_coin_records(coin_ids: Collection[bytes32]) -> list[CoinRecord]:
             ],
             coin,
         )
-        assert result[1] == MempoolInclusionStatus.SUCCESS
-    assert len(list(mempool_manager.mempool.get_items_by_coin_id(coin_id))) == 3
-    assert mempool_manager.mempool.size() == 3
-    assert len(list(mempool_manager.mempool.items_by_feerate())) == 3
+        if i == 0:
+            assert result[1] == MempoolInclusionStatus.SUCCESS
+        else:
+            assert result[1] == MempoolInclusionStatus.PENDING
+    assert len(list(mempool_manager.mempool.get_items_by_coin_id(coin_id))) == 1
+    assert mempool_manager.mempool.size() == 1
+    assert len(list(mempool_manager.mempool.items_by_feerate())) == 1
     # Setup a new peak where the incoming block has spent the coin
     # Mark this coin as spent
     test_coin_records = {coin_id: CoinRecord(coin, uint32(0), TEST_HEIGHT, False, uint64(0))}
@@ -1833,7 +1913,7 @@ async def make_setup_and_coins(
     sb_ef_name = sb_ef.name()
     await send_to_mempool(full_node_api, sb_ef)
     # Send also a transaction EG that spends E differently from DE and EF,
-    # so that it doesn't get deduplicated on E with them
+    # to ensure it's rejected by the mempool
     conditions = [
         [ConditionOpcode.CREATE_COIN, IDENTITY_PUZZLE_HASH, e_coin.amount],
         [ConditionOpcode.ASSERT_MY_COIN_ID, e_coin.name()],
@@ -1851,14 +1931,13 @@ async def make_setup_and_coins(
     [tx_g] = action_scope.side_effects.transactions
     assert tx_g.spend_bundle is not None
     sb_e2g = SpendBundle.aggregate([sb_e2, tx_g.spend_bundle])
-    sb_e2g_name = sb_e2g.name()
-    await send_to_mempool(full_node_api, sb_e2g)
+    await send_to_mempool(full_node_api, sb_e2g, expecting_conflict=True)
 
     # Make sure our coin IDs to spend bundles mappings are correct
     assert get_sb_names_by_coin_id(full_node_api, coins[4].coin.name()) == {sb_de_name}
-    assert get_sb_names_by_coin_id(full_node_api, e_coin_id) == {sb_de_name, sb_ef_name, sb_e2g_name}
+    assert get_sb_names_by_coin_id(full_node_api, e_coin_id) == {sb_de_name, sb_ef_name}
     assert get_sb_names_by_coin_id(full_node_api, coins[5].coin.name()) == {sb_ef_name}
-    assert get_sb_names_by_coin_id(full_node_api, g_coin_id) == {sb_e2g_name}
+    assert get_sb_names_by_coin_id(full_node_api, g_coin_id) == set()
 
     await farm_a_block(full_node_api, wallet_node, ph)
 
@@ -2520,7 +2599,7 @@ async def test_advancing_ff(use_optimization: bool) -> None:
     assert spend.latest_singleton_coin == spend_c.coin.name()
 
 
-@pytest.mark.parametrize("flags", [ELIGIBLE_FOR_DEDUP, ELIGIBLE_FOR_FF])
+@pytest.mark.parametrize("flags", [ELIGIBLE_FOR_DEDUP, ELIGIBLE_FOR_FF, ELIGIBLE_FOR_FF | ELIGIBLE_FOR_DEDUP])
 @pytest.mark.anyio
 async def test_check_removals_with_block_creation(flags: int) -> None:
     LAUNCHER_ID = bytes32([1] * 32)
@@ -2560,6 +2639,18 @@ async def test_check_removals_with_block_creation(flags: int) -> None:
     assert set(removals) == {singleton_spend.coin, TEST_COIN}
 
 
+@pytest.mark.anyio
+async def test_dedup_not_canonical() -> None:
+    # meant to be 1 in a non-canonical encoding; NOTE(review): confirm mk_coin_spend parses the hex as CLVM
+    spend = mk_coin_spend(TEST_COIN, solution="c00101")
+    manager = await setup_mempool(TestCoins(coins=[], lineage={}))
+    bundle = SpendBundle([spend], G2Element())
+    conds = make_test_conds(spend_ids=[(TEST_COIN, ELIGIBLE_FOR_DEDUP)])
+    # the conditions flag the spend as ELIGIBLE_FOR_DEDUP; adding the bundle is expected to fail
+    info = await manager.add_spend_bundle(bundle, conds, bundle.name(), uint32(1))
+    assert info.status == MempoolInclusionStatus.FAILED
+
+
 def make_coin_record(coin: Coin, spent_block_index: int = 0) -> CoinRecord:
     return CoinRecord(coin, uint32(0), uint32(spent_block_index), False, TEST_TIMESTAMP)
 
@@ -2626,6 +2717,21 @@ class CheckRemovalsCase:
         conflicting_mempool_items={TEST_COIN_ID: [mk_item([TEST_COIN], flags=[ELIGIBLE_FOR_DEDUP])]},
         expected_result=(None, []),
     ),
+    CheckRemovalsCase(
+        id="Dedup coin, Dedup mempool conflict with different solution",
+        removals={TEST_COIN_ID: TEST_COIN_RECORD},
+        bundle_coin_spends={TEST_COIN_ID: mk_bcs(mk_coin_spend(TEST_COIN, solution="ff8080"), ELIGIBLE_FOR_DEDUP)},
+        conflicting_mempool_items={TEST_COIN_ID: [mk_item([TEST_COIN], flags=[ELIGIBLE_FOR_DEDUP])]},
+        expected_result=(
+            Err.MEMPOOL_CONFLICT,
+            [
+                mk_item(
+                    [TEST_COIN],
+                    flags=[ELIGIBLE_FOR_DEDUP],
+                )
+            ],
+        ),
+    ),
     CheckRemovalsCase(
         id="Regular coin, mempool conflict",
         removals={TEST_COIN_ID: TEST_COIN_RECORD},
diff --git a/chia/full_node/mempool_manager.py b/chia/full_node/mempool_manager.py
@@ -150,6 +150,92 @@ class NewPeakItem:
 QUOTE_EXECUTION_COST = 20
 
 
+def is_atom_canonical(clvm_buffer: bytes, offset: int) -> tuple[int, bool]:
+    """
+    decodes the atom header at clvm_buffer[offset] and returns a tuple of:
+    * the total serialized size of the atom, header and payload included
+    * whether the atom is encoded with the shortest possible header
+    the payload itself does not have to be present in the buffer.
+    raises ValueError if the byte is a pair (0xff) or back-reference (0xfe)
+    marker, and IndexError if the buffer ends inside the length prefix.
+    """
+    b = clvm_buffer[offset]
+    if b < 0x80:
+        # a single byte below 0x80 is an atom that encodes itself
+        return 1, True
+    if b >= 0xFE:
+        raise ValueError(f"not an atom header: 0x{b:02x}")
+
+    # the number of 1-bits following the top bit is the number of extra
+    # length bytes (0-5). A header of N bytes in total holds 7 * N - 1 bits
+    # of length: 6, 5 + 8, 4 + 8 + 8, and so on
+    prefix_len = 0
+    while b & (0b01000000 >> prefix_len):
+        prefix_len += 1
+
+    atom_len = b & (0b00111111 >> prefix_len)
+    for i in range(1, prefix_len + 1):
+        atom_len = (atom_len << 8) | clvm_buffer[offset + i]
+
+    if prefix_len > 0:
+        # canonical only if the length is too big for a header one byte
+        # shorter, which holds 7 * prefix_len - 1 bits
+        canonical = atom_len >= 1 << (7 * prefix_len - 1)
+    elif atom_len == 1 and offset + 1 < len(clvm_buffer):
+        # a one-byte atom below 0x80 must be serialized as the bare byte,
+        # not with a 0x81 header
+        canonical = clvm_buffer[offset + 1] >= 0x80
+    else:
+        # NIL (0x80) and any other length that fits in 6 bits
+        canonical = True
+
+    return 1 + prefix_len + atom_len, canonical
+
+
+def is_clvm_canonical(clvm_buffer: bytes) -> bool:
+    """
+    checks whether the CLVM serialization is all canonical representation.
+    atoms can be serialized in more than one way by using more bytes than
+    necessary to encode the length prefix. This function ensures that all atoms
+    are encoded with the shortest representation. back-references are not
+    allowed and will make this function return false. An empty or truncated
+    buffer is not a complete serialization, and also returns false
+    """
+    offset = 0
+    tokens_left = 1
+    while tokens_left > 0:
+        if offset >= len(clvm_buffer):
+            return False  # the buffer ended before the tree was complete
+        b = clvm_buffer[offset]
+
+        # pair: it consumes one token and is followed by two more
+        if b == 0xFF:
+            tokens_left += 1
+            offset += 1
+            continue
+
+        # back references cannot be considered canonical, since they may be
+        # encoded in many different ways
+        if b == 0xFE:
+            return False
+
+        if b <= 0x80:
+            # small atom or NIL, always a single byte
+            atom_len = 1
+        else:
+            try:
+                atom_len, canonical = is_atom_canonical(clvm_buffer, offset)
+            except IndexError:
+                return False  # the buffer ended inside a length prefix
+            if not canonical:
+                return False
+        tokens_left -= 1
+        offset += atom_len
+
+    # if there's garbage at the end, it's not canonical
+    return offset == len(clvm_buffer)
+
+
 def check_removals(
     removals: dict[bytes32, CoinRecord],
     bundle_coin_spends: dict[bytes32, BundleCoinSpend],
@@ -167,20 +253,35 @@ def check_removals(
         # 1. Checks if it's been spent already
         if removals[coin_id].spent and not coin_bcs.eligible_for_fast_forward:
             return Err.DOUBLE_SPEND, []
+
         # 2. Checks if there's a mempool conflict
-        # Only consider conflicts if the coin is not eligible for deduplication
         conflicting_items = get_items_by_coin_ids([coin_id])
-        if not coin_bcs.eligible_for_fast_forward and not coin_bcs.eligible_for_dedup:
-            conflicts.update(conflicting_items)
-            continue
         for item in conflicting_items:
             if item in conflicts:
                 continue
             conflict_bcs = item.bundle_coin_spends[coin_id]
-            if (coin_bcs.eligible_for_fast_forward and not conflict_bcs.eligible_for_fast_forward) or (
-                coin_bcs.eligible_for_dedup and not conflict_bcs.eligible_for_dedup
+            # if the spend we're adding to the mempool is not DEDUP nor FF, it's
+            # just a regular conflict
+            if not coin_bcs.eligible_for_fast_forward and not coin_bcs.eligible_for_dedup:
+                conflicts.add(item)
+
+            # if the spend we're adding is FF, but there's a conflicting spend
+            # that isn't FF, they can't be chained, so that's a conflict
+            elif coin_bcs.eligible_for_fast_forward and not conflict_bcs.eligible_for_fast_forward:
+                conflicts.add(item)
+
+            # if the spend we're adding is DEDUP, but there's a conflicting spend
+            # that isn't DEDUP, we cannot merge them, so that's a conflict
+            elif coin_bcs.eligible_for_dedup and not conflict_bcs.eligible_for_dedup:
+                conflicts.add(item)
+
+            # if the spend we're adding is DEDUP but the existing spend has a
+            # different solution, we cannot merge them, so that's a conflict
+            elif coin_bcs.eligible_for_dedup and bytes(coin_bcs.coin_spend.solution) != bytes(
+                conflict_bcs.coin_spend.solution
             ):
                 conflicts.add(item)
+
     if len(conflicts) > 0:
         return Err.MEMPOOL_CONFLICT, list(conflicts)
     return None, []
@@ -523,6 +624,8 @@ async def validate_spend_bundle(
                 coin_id,
                 EligibilityAndAdditions(is_eligible_for_dedup=False, spend_additions=[], ff_puzzle_hash=None),
             )
+
+            supports_dedup = eligibility_info.is_eligible_for_dedup and is_clvm_canonical(bytes(coin_spend.solution))
             mark_as_fast_forward = eligibility_info.ff_puzzle_hash is not None and supports_fast_forward(coin_spend)
             latest_singleton_coin = None
             if mark_as_fast_forward:
@@ -536,7 +639,7 @@ async def validate_spend_bundle(
                 latest_singleton_coin = lineage_info.coin_id
             bundle_coin_spends[coin_id] = BundleCoinSpend(
                 coin_spend=coin_spend,
-                eligible_for_dedup=eligibility_info.is_eligible_for_dedup,
+                eligible_for_dedup=supports_dedup,
                 eligible_for_fast_forward=mark_as_fast_forward,
                 additions=eligibility_info.spend_additions,
                 latest_singleton_coin=latest_singleton_coin,