
Commit 81f917d

REP-6380 Add optional hash-based verification. (#122)
This replaces the current `--ignoreFieldOrder` flag with a new `--docCompareMethod` parameter that accepts three values: the two old modes, plus a new one that compares documents via `$toHashedIndexKey`. The new mode is not the default because of the precision issues described in the README's new section, but its performance gains probably outweigh those concerns for most migrations. In a reference test, an initial scan that took 37 minutes with `binary` comparison took under 15 minutes with the new method.
1 parent 9af8b54 commit 81f917d
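The idea behind the new mode can be sketched in Go. Rather than shipping full documents to the verifier, each side reduces a document to a (hash, length) pair, and only those pairs are compared. This sketch uses FNV-1a as a stand-in hash; the real verifier relies on the server-side `$toHashedIndexKey` operator, and `docSummary`/`summarize` are hypothetical names, not verifier code.

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// docSummary is a hypothetical stand-in for what hashed comparison works
// with: a hash of the document bytes plus the document's length, instead
// of the full document.
type docSummary struct {
	hash   uint64
	length int
}

// summarize hashes raw document bytes with FNV-1a. The actual verifier
// uses the server's $toHashedIndexKey hash; FNV here is only illustrative.
func summarize(raw []byte) docSummary {
	h := fnv.New64a()
	h.Write(raw)
	return docSummary{hash: h.Sum64(), length: len(raw)}
}

func main() {
	src := []byte(`{"_id": 1, "n": 42}`)
	dst := []byte(`{"_id": 1, "n": 43}`)

	// Identical bytes produce equal summaries.
	fmt.Println(summarize(src) == summarize(src))
	// A changed value yields a different summary (the hash differs even
	// though the length is the same).
	fmt.Println(summarize(src) == summarize(dst))
}
```

A hash match with an equal length is treated as "same document," which is exactly where the precision caveats described in the README's new section come from.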

File tree

16 files changed: +639 −196

.github/workflows/all.yml

Lines changed: 15 additions & 1 deletion

```diff
@@ -22,6 +22,15 @@ jobs:
             srcConnStr: mongodb://localhost:27020,localhost:27021,localhost:27022
             dstConnStr: mongodb://localhost:27030,localhost:27031,localhost:27032

+        exclude:
+          - mongodb_versions: [ '4.2', '4.2' ]
+            toHashedIndexKey: true
+          - mongodb_versions: [ '4.2', '4.4' ]
+            toHashedIndexKey: true
+          - mongodb_versions: [ '4.2', '5.0' ]
+            toHashedIndexKey: true
+          - mongodb_versions: [ '4.2', '6.0' ]
+            toHashedIndexKey: true

         # versions are: source, destination
         mongodb_versions:
@@ -33,10 +42,12 @@ jobs:
           - [ '4.4', '4.4' ]
           - [ '4.4', '5.0' ]
           - [ '4.4', '6.0' ]
+          - [ '4.4', '8.0' ]

           - [ '5.0', '5.0' ]
           - [ '5.0', '6.0' ]
           - [ '5.0', '7.0' ]
+          - [ '5.0', '8.0' ]

           - [ '6.0', '6.0' ]
           - [ '6.0', '7.0' ]
@@ -47,6 +58,8 @@ jobs:

           - [ '8.0', '8.0' ]

+        toHashedIndexKey: [true, false]
+
         topology:
           - name: replset
             srcConnStr: mongodb://localhost:27020,localhost:27021,localhost:27022
@@ -67,7 +80,7 @@ jobs:
     # versions need.
     runs-on: ubuntu-22.04

-    name: ${{ matrix.mongodb_versions[0] }} to ${{ matrix.mongodb_versions[1] }}, ${{ matrix.topology.name }}
+    name: ${{ matrix.mongodb_versions[0] }} to ${{ matrix.mongodb_versions[1] }}, ${{ matrix.topology.name }}${{ matrix.toHashedIndexKey && ', hashed doc compare' || '' }}

     steps:
       - run: uname -a
@@ -110,6 +123,7 @@ jobs:
      - name: Test
        run: go test -v ./... -race
        env:
+         MVTEST_DOC_COMPARE_METHOD: ${{matrix.toHashedIndexKey && 'toHashedIndexKey' || ''}}
         MVTEST_SRC: ${{matrix.topology.srcConnStr}}
         MVTEST_DST: ${{matrix.topology.dstConnStr}}
         MVTEST_META: mongodb://localhost:27040
```

README.md

Lines changed: 33 additions & 1 deletion

```diff
@@ -131,7 +131,7 @@ The verifier will now check to completion to make sure that there are no inconsistencies
 | `--srcNamespace <namespaces>` | source namespaces to check |
 | `--dstNamespace <namespaces>` | destination namespaces to check |
 | `--metaDBName <name>` | name of the database in which to store verification metadata (default: "migration_verification_metadata") |
-| `--ignoreFieldOrder` | Whether or not field order is ignored in documents |
+| `--docCompareMethod` | How to compare documents. See below for details. |
 | `--verifyAll` | If set, verify all user namespaces |
 | `--clean` | If set, drop all previous verification metadata before starting |
 | `--readPreference <value>` | Read preference for reading data from clusters. May be 'primary', 'secondary', 'primaryPreferred', 'secondaryPreferred', or 'nearest' (default: "primary") |
@@ -312,6 +312,38 @@ The migration-verifier optimizes for the case where a migration’s initial sync

 The migration-verifier is also rather resource-hungry. To mitigate this, try limiting its number of workers (i.e., `--numWorkers`), its partition size (`--partitionSizeMB`), and/or its process group’s resource limits (see the `ulimit` command in POSIX OSes).

+# Document comparison methods
+
+## `binary`
+
+The default. This establishes full binary equivalence, including field order and all types.
+
+## `ignoreFieldOrder`
+
+Like `binary` but ignores the ordering of fields. Incurs extra processing overhead on the verifier host.
+
+## `toHashedIndexKey`
+
+Compares document hashes (and lengths) rather than full documents. This minimizes the data sent to migration-verifier, which can dramatically shorten verification time.
+
+It carries a few downsides, though:
+
+### Lost precision
+
+This method ignores certain type changes if the underlying value remains the same. For example, if a Long changes to a Double, and the two values are identical, `toHashedIndexKey` will not notice the discrepancy.
+
+The discrepancy _will_, though, usually be seen if the BSON types are of different lengths. For example, if a Long changes to a Decimal, `toHashedIndexKey` will notice that.
+
+If, however, _multiple_ numeric type changes happen, then `toHashedIndexKey` will only notice the discrepancy if the total document length changes. For example, if an Int changes to a Long, but elsewhere a Long changes to an Int, that will evade notice.
+
+The above are all, of course, **highly** unlikely in real-world migrations.
+
+### Lost reporting
+
+Full-document verification methods allow migration-verifier to diagnose mismatches, e.g., by identifying specific changed fields. The only such detail that `toHashedIndexKey` can discern, though, is a change in document length.
+
+Additionally, because the amount of data sent to migration-verifier doesn’t actually reflect the documents’ size, no meaningful statistics are shown concerning the collection data size. Document counts, of course, are still shown.
+
 # Known Issues

 - The verifier may report missing documents on the destination that don’t actually appear to be missing (i.e., a nonexistent problem). This has been hard to reproduce. If missing documents are reported, it is good practice to check for false positives.
```
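The length-based caveats in the new "Lost precision" section can be checked with quick arithmetic over BSON value sizes (4 bytes for Int, 8 for Long and Double, 16 for Decimal, per the BSON spec). This is an illustrative sketch, not verifier code; `docLength` ignores field names and document framing, which are unchanged in these scenarios.

```go
package main

import "fmt"

// Value sizes of BSON numeric types, per the BSON spec.
var bsonValueSize = map[string]int{
	"int32":      4,
	"int64":      8,
	"double":     8,
	"decimal128": 16,
}

// docLength sums the value sizes of a document's fields, ignoring field
// names and framing bytes (which don't change in these scenarios).
func docLength(fieldTypes []string) int {
	total := 0
	for _, t := range fieldTypes {
		total += bsonValueSize[t]
	}
	return total
}

func main() {
	// A lone Long -> Decimal change shifts the document length, so hashed
	// comparison notices it:
	fmt.Println(docLength([]string{"int64"}) != docLength([]string{"decimal128"})) // true

	// But an Int -> Long change in one field, offset by a Long -> Int
	// change in another, leaves the total length unchanged (4+8 == 8+4),
	// so only the hash itself could catch it:
	fmt.Println(docLength([]string{"int32", "int64"}) == docLength([]string{"int64", "int32"})) // true
}
```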
Lines changed: 22 additions & 0 deletions

```diff
@@ -0,0 +1,22 @@
+package comparehashed
+
+func CanCompareDocsViaToHashedIndexKey(
+	version []int,
+) bool {
+	if version[0] >= 8 {
+		return true
+	}
+
+	switch version[0] {
+	case 7:
+		return version[2] >= 6
+	case 6:
+		return version[2] >= 14
+	case 5:
+		return version[2] >= 25
+	case 4:
+		return version[1] == 4 && version[2] >= 29
+	default:
+		return false
+	}
+}
```
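Read as a version gate: hashed comparison requires a server at 8.0+, 7.0.6+, 6.0.14+, 5.0.25+, or 4.4.29+; 4.2 and earlier never qualify. The snippet below copies the function from the diff (renamed `canCompare` to keep the sketch self-contained) and exercises it on a few versions.

```go
package main

import "fmt"

// canCompare reproduces the gate from the comparehashed package: hashed
// document comparison needs a server version whose $toHashedIndexKey
// behavior is usable for verification.
func canCompare(version []int) bool {
	if version[0] >= 8 {
		return true
	}

	switch version[0] {
	case 7:
		return version[2] >= 6
	case 6:
		return version[2] >= 14
	case 5:
		return version[2] >= 25
	case 4:
		return version[1] == 4 && version[2] >= 29
	default:
		return false
	}
}

func main() {
	fmt.Println(canCompare([]int{8, 0, 0}))  // true: any 8.x+
	fmt.Println(canCompare([]int{6, 0, 13})) // false: needs 6.0.14+
	fmt.Println(canCompare([]int{4, 2, 29})) // false: 4.2 never qualifies
}
```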

internal/partitions/partition.go

Lines changed: 61 additions & 48 deletions

```diff
@@ -2,12 +2,14 @@ package partitions

 import (
 	"fmt"
+	"slices"

-	"github.com/10gen/migration-verifier/internal/logger"
 	"github.com/10gen/migration-verifier/internal/util"
+	"github.com/10gen/migration-verifier/option"
 	"github.com/pkg/errors"
 	"go.mongodb.org/mongo-driver/bson"
 	"go.mongodb.org/mongo-driver/bson/primitive"
+	"go.mongodb.org/mongo-driver/mongo"
 )

 // PartitionKey represents the _id of a partition document stored in the destination.
@@ -94,66 +96,75 @@ func (p *Partition) lowerBoundFromCurrent(current bson.Raw) (any, error) {
 	return nil, errors.New("could not find an '_id' element in the raw document")
 }

-// FindCmd constructs the Find command for reading documents from the partition. For capped
-// collections, the sort order will be `$natural` and the `lowerBound` argument is ignored. For
-// all other collections, the collection will be sorted by the `_id` field. The `lowerBound`
-// argument will determine the starting point for the find. If it is `nil`, then the value of
-// `p.Key.Lower`.
-//
-// This always constructs a non-type-bracketed find command.
-func (p *Partition) FindCmd(
-	// TODO (REP-1281)
-	logger *logger.Logger,
-	startAt *primitive.Timestamp,
-	// We only use this for testing.
-	batchSize ...int,
-) bson.D {
-	// Get the bounded query filter from the partition to be used in the Find command.
-	findCmd := bson.D{
-		{"find", p.Ns.Coll},
-		{"collectionUUID", p.Key.SourceUUID},
-		{"readConcern", bson.D{
-			{"level", "majority"},
-			// Start the cursor after the global state's ChangeStreamStartAtTs. Otherwise,
-			// there may be changes made by collection copy prior to change event application's
-			// start time that are not accounted for, leading to potential data
-			// inconsistencies.
-			{"afterClusterTime", startAt},
-		}},
-		// The cursor should not have a timeout.
-		{"noCursorTimeout", true},
+type PartitionQueryParameters struct {
+	filter    option.Option[bson.D]
+	sortField option.Option[string]
+	hint      option.Option[bson.D]
+}
+
+func (pqp PartitionQueryParameters) ToFindOptions() bson.D {
+	doc := bson.D{}
+
+	if theFilter, has := pqp.filter.Get(); has {
+		doc = append(doc, bson.E{"filter", theFilter})
+	}
+
+	pqp.addHintIfNeeded(&doc)
+
+	return doc
+}
+
+func (pqp PartitionQueryParameters) ToAggOptions() bson.D {
+	pl := mongo.Pipeline{}
+
+	if theFilter, has := pqp.filter.Get(); has {
+		pl = append(pl, bson.D{{"$match", theFilter}})
 	}
-	if len(batchSize) > 0 {
-		findCmd = append(findCmd, bson.E{"batchSize", batchSize[0]})
+
+	if theSort, has := pqp.sortField.Get(); has {
+		pl = append(pl, bson.D{{"$sort", bson.D{{theSort, 1}}}})
 	}
-	findOptions := p.GetFindOptions(nil, nil)
-	findCmd = append(findCmd, findOptions...)

-	return findCmd
+	doc := bson.D{
+		{"pipeline", pl},
+	}
+
+	pqp.addHintIfNeeded(&doc)
+
+	return doc
+}
+
+func (pqp PartitionQueryParameters) addHintIfNeeded(docRef *bson.D) {
+	if theHint, has := pqp.hint.Get(); has {
+		*docRef = append(*docRef, bson.E{"hint", theHint})
+	}
 }

-// GetFindOptions returns only the options necessary to do a find on any given collection with this
-// partition. It is intended to allow the same partitioning to be used on different collections
-// (e.g. use the partitions on the source to read the destination for verification)
+// GetQueryParameters returns a PartitionQueryParameters that describes the
+// parameters needed to fetch docs for the partition. It is intended to allow
+// the same partitioning to be used on different collections (e.g. use the
+// partitions on the source to read the destination for verification)
 // If the passed-in buildinfo indicates a mongodb version < 5.0, type bracketing is not used.
 // filterAndPredicates is a slice of filter criteria that's used to construct the "filter" field in the find option.
-func (p *Partition) GetFindOptions(clusterInfo *util.ClusterInfo, filterAndPredicates bson.A) bson.D {
+func (p *Partition) GetQueryParameters(clusterInfo *util.ClusterInfo, filterAndPredicates bson.A) PartitionQueryParameters {
+	params := PartitionQueryParameters{}
+
 	if p == nil {
 		if len(filterAndPredicates) > 0 {
-			return bson.D{{"filter", bson.D{{"$and", filterAndPredicates}}}}
+			params.filter = option.Some(bson.D{{"$and", filterAndPredicates}})
 		}
-		return bson.D{}
+
+		return params
 	}
-	findOptions := bson.D{}
+
 	if p.IsCapped {
 		// For capped collections, sort the documents by their natural order. We deliberately
 		// exclude the ID filter to ensure that documents are inserted in the correct order.
-		sort := bson.E{"sort", bson.D{{"$natural", 1}}}
-		findOptions = append(findOptions, sort)
+		params.sortField = option.Some("$natural")
 	} else {
 		// For non-capped collections, sort by _id to minimize the amount of time
 		// that a given document spends cached in memory.
-		findOptions = append(findOptions, bson.E{"sort", bson.D{{"_id", 1}}})
+		params.sortField = option.Some("_id")

 		// For non-capped collections, the cursor should use the ID filter and the _id index.
 		// Get the bounded query filter from the partition to be used in the Find command.
@@ -167,20 +178,22 @@ func (p *Partition) GetQueryParameters(clusterInfo *util.ClusterInfo, filterAndPredicates bson.A) PartitionQueryParameters {
 			}
 		}

+		filterAndPredicates = slices.Clone(filterAndPredicates)
+
 		if useExprFind {
 			filterAndPredicates = append(filterAndPredicates, p.filterWithExpr())
 		} else {
 			filterAndPredicates = append(filterAndPredicates, p.filterWithExplicitTypeChecks())
 		}

-		hint := bson.E{"hint", bson.D{{"_id", 1}}}
-		findOptions = append(findOptions, hint)
+		params.hint = option.Some(bson.D{{"_id", 1}})
 	}

 	if len(filterAndPredicates) > 0 {
-		findOptions = append(findOptions, bson.E{"filter", bson.D{{"$and", filterAndPredicates}}})
+		params.filter = option.Some(bson.D{{"$and", filterAndPredicates}})
 	}
-	return findOptions
+
+	return params
 }

 // filterWithExpr returns a range filter on _id to be used in a Find query for the
```
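The partition.go refactor replaces a find-only options builder with a parameters struct that can render as either find-command options or an aggregation pipeline. Below is a minimal stand-alone sketch of that pattern, using stand-in types in place of the driver's `bson.D`/`bson.E` and the repo's `option.Option`; all names here are illustrative, not the verifier's actual code.

```go
package main

import "fmt"

// E and D are minimal stand-ins for bson.E and bson.D.
type E struct {
	Key string
	Val any
}
type D []E

// Option is a tiny optional-value type mimicking the repo's option.Option.
type Option[T any] struct {
	val T
	ok  bool
}

func Some[T any](v T) Option[T]    { return Option[T]{val: v, ok: true} }
func (o Option[T]) Get() (T, bool) { return o.val, o.ok }

// queryParams mirrors the idea of PartitionQueryParameters: one source of
// truth that renders as either find options or an aggregation pipeline.
type queryParams struct {
	filter    Option[D]
	sortField Option[string]
	hint      Option[D]
}

// toFindOptions emits flat find-command fields (filter, hint).
func (q queryParams) toFindOptions() D {
	doc := D{}
	if f, has := q.filter.Get(); has {
		doc = append(doc, E{"filter", f})
	}
	if h, has := q.hint.Get(); has {
		doc = append(doc, E{"hint", h})
	}
	return doc
}

// toAggOptions emits the same parameters as $match/$sort pipeline stages,
// plus a top-level hint.
func (q queryParams) toAggOptions() D {
	pl := []D{}
	if f, has := q.filter.Get(); has {
		pl = append(pl, D{{"$match", f}})
	}
	if s, has := q.sortField.Get(); has {
		pl = append(pl, D{{"$sort", D{{s, 1}}}})
	}
	doc := D{{"pipeline", pl}}
	if h, has := q.hint.Get(); has {
		doc = append(doc, E{"hint", h})
	}
	return doc
}

func main() {
	q := queryParams{
		filter:    Some(D{{"_id", D{{"$gte", 5}}}}),
		sortField: Some("_id"),
		hint:      Some(D{{"_id", 1}}),
	}
	fmt.Println(len(q.toFindOptions())) // filter + hint
	fmt.Println(len(q.toAggOptions()))  // pipeline + hint
}
```

The design lets the same partition bounds drive both a `find` (used for plain document reads) and an aggregation (needed for the hashed comparison pipeline) without duplicating filter logic.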
