Merge bitcoin#20557: addrman: Fix new table bucketing during unserialization

laanwj · gades · commit 0bff4331731d · 2022-05-08T18:23:23.000+03:00
4676a4f [addrman] Don't repeat "Bucketing method was updated" log multiple times (John Newbery) 4362923 [addrman] Improve serialization comments (John Newbery) ac3547e [addrman] Improve variable naming/code style of touched code. (John Newbery) a5c9b04 [addrman] Don't rebucket new table entries unnecessarily (John Newbery) 8062d92 [addrman] Rename asmap version to asmap checksum (John Newbery) 009b8e0 [addrman] Improve variable naming/code style of touched code. (John Newbery) b4c5fda [addrman] Fix new table bucketing during unserialization (John Newbery) Pull request description: This fixes three issues in addrman unserialization. 1. An addrman entry can appear in up to 8 new table buckets. We store this entry->bucket indexing during shutdown so that on restart we can restore the entries to their correct buckets. Commit ec45646 broke the deserialization code so that each entry could only be put in up to one new bucket. 2. Unserialization may result in an entry appearing in a 9th bucket. If the entry already appears in 8 buckets don't try to place it in another bucket. 3. We unnecessarily rebucket when reading a peers.dat with file version 1. Don't do that. ACKs for top commit: vasild: ACK 4676a4f glozow: re-ACK bitcoin@4676a4f, changes were a rename, comments, and removing repeat-logging. naumenkogs: ACK 4676a4f laanwj: Code review ACK 4676a4f dhruv: ACK 4676a4f ryanofsky: Code review ACK 4676a4f. I'm not previously familiar with this code but all the changes here do make sense and seem like improvements. Left some notes and comments, but they aren't important so feel to ignore. Tree-SHA512: b228984f6dec5910be23c3740ae20258da33bcf66ceb7edb10e5a53163450f743bab349e47f09808b7e8d40f27143119ec3e0981d7e678aa494d8559a1c99c23
diff --git a/src/addrman.h b/src/addrman.h
@@ -315,22 +315,20 @@ class CAddrMan
      * * nNew
      * * nTried
      * * number of "new" buckets XOR 2**30
-     * * all nNew addrinfos in vvNew
-     * * all nTried addrinfos in vvTried
-     * * for each bucket:
+     * * all new addresses (total count: nNew)
+     * * all tried addresses (total count: nTried)
+     * * for each new bucket:
      *   * number of elements
-     *   * for each element: index
+     *   * for each element: index in the serialized "all new addresses"
+     * * asmap checksum
      *
      * 2**30 is xorred with the number of buckets to make addrman deserializer v0 detect it
      * as incompatible. This is necessary because it did not check the version number on
      * deserialization.
      *
-     * Notice that vvTried, mapAddr and vVector are never encoded explicitly;
+     * vvNew, vvTried, mapInfo, mapAddr and vRandom are never encoded explicitly;
      * they are instead reconstructed from the other information.
      *
-     * vvNew is serialized, but only used if ADDRMAN_UNKNOWN_BUCKET_COUNT didn't change,
-     * otherwise it is reconstructed as well.
-     *
      * This format is more complex, but significantly smaller (at most 1.5 MiB), and supports
      * changes to the ADDRMAN_ parameters without breaking the on-disk structure.
      *
@@ -388,13 +386,13 @@ class CAddrMan
                 }
             }
         }
-        // Store asmap version after bucket entries so that it
+        // Store asmap checksum after bucket entries so that it
         // can be ignored by older clients for backward compatibility.
-        uint256 asmap_version;
+        uint256 asmap_checksum;
         if (m_asmap.size() != 0) {
-            asmap_version = SerializeHash(m_asmap);
+            asmap_checksum = SerializeHash(m_asmap);
         }
-        s << asmap_version;
+        s << asmap_checksum;
     }
 
     template <typename Stream>
@@ -477,47 +475,63 @@ class CAddrMan
         nTried -= nLost;
 
         // Store positions in the new table buckets to apply later (if possible).
-        std::map<int, int> entryToBucket; // Represents which entry belonged to which bucket when serializing
-
-        for (int bucket = 0; bucket < nUBuckets; bucket++) {
-            int nSize = 0;
-            s >> nSize;
-            for (int n = 0; n < nSize; n++) {
-                int nIndex = 0;
-                s >> nIndex;
-                if (nIndex >= 0 && nIndex < nNew) {
-                    entryToBucket[nIndex] = bucket;
+        // An entry may appear in up to ADDRMAN_NEW_BUCKETS_PER_ADDRESS buckets,
+        // so we store all bucket-entry_index pairs to iterate through later.
+        std::vector<std::pair<int, int>> bucket_entries;
+
+        for (int bucket = 0; bucket < nUBuckets; ++bucket) {
+            int num_entries{0};
+            s >> num_entries;
+            for (int n = 0; n < num_entries; ++n) {
+                int entry_index{0};
+                s >> entry_index;
+                if (entry_index >= 0 && entry_index < nNew) {
+                    bucket_entries.emplace_back(bucket, entry_index);
                 }
             }
         }
 
-        uint256 supplied_asmap_version;
+        // If the bucket count and asmap checksum haven't changed, then attempt
+        // to restore the entries to the buckets/positions they were in before
+        // serialization.
+        uint256 supplied_asmap_checksum;
         if (m_asmap.size() != 0) {
-            supplied_asmap_version = SerializeHash(m_asmap);
+            supplied_asmap_checksum = SerializeHash(m_asmap);
         }
-        uint256 serialized_asmap_version;
+        uint256 serialized_asmap_checksum;
         if (format >= Format::V2_ASMAP) {
-            s >> serialized_asmap_version;
+            s >> serialized_asmap_checksum;
         }
+        const bool restore_bucketing{nUBuckets == ADDRMAN_NEW_BUCKET_COUNT &&
+                                     serialized_asmap_checksum == supplied_asmap_checksum};
 
-        for (int n = 0; n < nNew; n++) {
-            CAddrInfo &info = mapInfo[n];
-            int bucket = entryToBucket[n];
-            int nUBucketPos = info.GetBucketPosition(nKey, true, bucket);
-            if (format >= Format::V2_ASMAP && nUBuckets == ADDRMAN_NEW_BUCKET_COUNT && vvNew[bucket][nUBucketPos] == -1 &&
-                info.nRefCount < ADDRMAN_NEW_BUCKETS_PER_ADDRESS && serialized_asmap_version == supplied_asmap_version) {
+        if (!restore_bucketing) {
+            LogPrint(BCLog::ADDRMAN, "Bucketing method was updated, re-bucketing addrman entries from disk\n");
+        }
+
+        for (auto bucket_entry : bucket_entries) {
+            int bucket{bucket_entry.first};
+            const int entry_index{bucket_entry.second};
+            CAddrInfo& info = mapInfo[entry_index];
+
+            // The entry shouldn't appear in more than
+            // ADDRMAN_NEW_BUCKETS_PER_ADDRESS. If it has already, just skip
+            // this bucket_entry.
+            if (info.nRefCount >= ADDRMAN_NEW_BUCKETS_PER_ADDRESS) continue;
+
+            int bucket_position = info.GetBucketPosition(nKey, true, bucket);
+            if (restore_bucketing && vvNew[bucket][bucket_position] == -1) {
                 // Bucketing has not changed, using existing bucket positions for the new table
-                vvNew[bucket][nUBucketPos] = n;
-                info.nRefCount++;
+                vvNew[bucket][bucket_position] = entry_index;
+                ++info.nRefCount;
             } else {
-                // In case the new table data cannot be used (format unknown, bucket count wrong or new asmap),
+                // In case the new table data cannot be used (bucket count wrong or new asmap),
                 // try to give them a reference based on their primary source address.
-                LogPrint(BCLog::ADDRMAN, "Bucketing method was updated, re-bucketing addrman entries from disk\n");
                 bucket = info.GetNewBucket(nKey, m_asmap);
-                nUBucketPos = info.GetBucketPosition(nKey, true, bucket);
-                if (vvNew[bucket][nUBucketPos] == -1) {
-                    vvNew[bucket][nUBucketPos] = n;
-                    info.nRefCount++;
+                bucket_position = info.GetBucketPosition(nKey, true, bucket);
+                if (vvNew[bucket][bucket_position] == -1) {
+                    vvNew[bucket][bucket_position] = entry_index;
+                    ++info.nRefCount;
                 }
             }
         }