|
| 1 | +// Migration script: convert MongoDB GUID fields from BinData subtype 3 (CSharpLegacy) |
| 2 | +// to BinData subtype 4 (Standard/RFC 4122). |
| 3 | +// |
| 4 | +// Usage: |
| 5 | +// npx tsc database/migrate-guids-to-subtype4.ts |
| 6 | +// mongosh CombineDatabase database/migrate-guids-to-subtype4.js |
| 7 | +// |
| 8 | +// Type-check only (no output): |
| 9 | +// npx tsc --project database/tsconfig.json --noEmit |
| 10 | +// |
| 11 | +// Background: |
| 12 | +// The C# MongoDB driver previously encoded System.Guid values using BinData subtype 3 |
| 13 | +// with a CSharpLegacy byte order (little-endian for the first three components). |
| 14 | +// BinData subtype 4 uses a stable RFC 4122 byte order across all drivers, making |
| 15 | +// it easier to search by GUID in mongosh and other tools. |
| 16 | +// |
| 17 | +// CSharpLegacy byte order for UUID aabbccdd-eeff-gghh-iijj-kkllmmnnoopp: |
| 18 | +// stored as [dd,cc,bb,aa, ff,ee, hh,gg, ii,jj,kk,ll,mm,nn,oo,pp] |
| 19 | +// Standard (subtype 4) byte order for the same UUID: |
| 20 | +// stored as [aa,bb,cc,dd, ee,ff, gg,hh, ii,jj,kk,ll,mm,nn,oo,pp] |
| 21 | + |
| 22 | +// Type declarations for mongosh globals. |
| 23 | +// All `declare` statements are erased by tsc and produce no JS output. |
| 24 | +// |
| 25 | +// The inline BsonBinary interface matches the bson@^7 Binary API: |
| 26 | +// sub_type is a number property; toString("hex") returns the hex string. |
/**
 * Minimal view of a BSON binary value as this script uses it:
 * `sub_type` is the numeric BinData subtype and `toString("hex")` yields the
 * payload as a hex string.
 */
interface BsonBinary {
  sub_type: number;
  toString(encoding: "hex" | "base64" | "utf8" | "utf-8"): string;
}

/** mongosh global: build a subtype-4 UUID from its canonical string form. */
declare function UUID(hexstr?: string): BsonBinary;
/** mongosh global: write one line to the shell output. */
declare function print(msg: string): void;
/** mongosh global: the database selected on the command line. */
declare const db: MongoDB;

/** A document, filter, or update expression with no schema assumed. */
type MongoDoc = Record<string, unknown>;

interface MongoCursor {
  /** Invoke `callback` once for each document the cursor yields. */
  forEach(callback: (doc: MongoDoc) => void): void;
}

/**
 * Subset of the result object returned by `updateOne`.
 * Declared so callers can check whether a write matched or changed anything;
 * callers that ignore the return value are unaffected.
 */
interface MongoUpdateResult {
  acknowledged: boolean;
  matchedCount: number;
  modifiedCount: number;
}

interface MongoCollection {
  find(query: MongoDoc): MongoCursor;
  updateOne(filter: MongoDoc, update: MongoDoc): MongoUpdateResult;
}

interface MongoDB {
  getCollection(name: string): MongoCollection;
  getCollectionNames(): string[];
}
| 50 | + |
/**
 * Convert a BinData subtype 3 (CSharpLegacy) GUID to BinData subtype 4 (Standard).
 *
 * The first three GUID components (4 + 2 + 2 bytes) are byte-reversed from
 * little-endian to big-endian; the trailing 8 bytes are copied unchanged.
 *
 * @param bin - Any value read from a document field.
 * @returns The equivalent subtype-4 UUID, or null when `bin` is not an object
 *   with `sub_type === 3` whose hex form is exactly 16 bytes (32 hex digits).
 */
function csharpGuidToStandard(bin: unknown): ReturnType<typeof UUID> | null {
  if (bin === null || typeof bin !== "object") {
    return null;
  }
  const binary = bin as Partial<BsonBinary>;
  if (binary.sub_type !== 3 || typeof binary.toString !== "function") {
    return null;
  }

  // Require exactly 32 hex digits. Pair-splitting alone would silently drop a
  // trailing odd character and would let non-hex text reach UUID().
  const hex = binary.toString("hex");
  if (typeof hex !== "string" || !/^[0-9a-f]{32}$/i.test(hex)) {
    return null;
  }

  // Each byte is two hex characters, so byte k occupies hex[2k .. 2k+2).
  const byteAt = (k: number): string => hex.slice(2 * k, 2 * k + 2);

  const uuidStr = [
    byteAt(3) + byteAt(2) + byteAt(1) + byteAt(0), // 4-byte component, reversed
    byteAt(5) + byteAt(4), // 2-byte component, reversed
    byteAt(7) + byteAt(6), // 2-byte component, reversed
    hex.slice(16, 20), // bytes 8-9, already big-endian
    hex.slice(20), // bytes 10-15, already big-endian
  ].join("-");
  return UUID(uuidStr);
}
| 79 | + |
// Running totals for the summary line. Both are advanced only after an update
// call returns without throwing, so failed writes are never reported as converted.
let totalGuidsConverted = 0;
let totalDocumentsUpdated = 0;
let totalDocumentsFailed = 0;

/** Where the GUID fields live in the documents of one collection. */
interface GuidLocations {
  /** True when each document carries its own top-level `guid` field. */
  topLevelGuid: boolean;
  /** Name of the array field whose elements each carry a `guid` field. */
  arrayField: string;
}

/**
 * Convert every subtype-3 GUID in one collection to subtype 4.
 *
 * Documents are selected when any of the described GUID fields holds binData;
 * csharpGuidToStandard then decides per value whether a conversion is needed,
 * so values that are already subtype 4 are left untouched.
 *
 * @param collName - Name of the collection to migrate.
 * @param locations - Which GUID fields the collection's documents contain.
 */
function migrateCollectionGuids(
  collName: string,
  locations: GuidLocations
): void {
  const coll = db.getCollection(collName);
  const nestedGuidPath = `${locations.arrayField}.guid`;
  const isBinData = { $type: "binData" };

  // Match on the nested GUIDs as well as the top-level one, so a document whose
  // top-level guid is not binData still has its array elements migrated.
  const query: MongoDoc = locations.topLevelGuid
    ? { $or: [{ guid: isBinData }, { [nestedGuidPath]: isBinData }] }
    : { [nestedGuidPath]: isBinData };

  coll.find(query).forEach((doc) => {
    const update: Record<string, ReturnType<typeof UUID>> = {};

    if (locations.topLevelGuid) {
      const newGuid = csharpGuidToStandard(doc["guid"]);
      if (newGuid !== null) {
        update["guid"] = newGuid;
      }
    }

    const elements = doc[locations.arrayField];
    if (Array.isArray(elements)) {
      elements.forEach((element: unknown, i: number) => {
        if (element === null || typeof element !== "object") {
          return;
        }
        const newGuid = csharpGuidToStandard((element as MongoDoc)["guid"]);
        if (newGuid !== null) {
          // Positional path, e.g. "senses.2.guid", targets just this element.
          update[`${locations.arrayField}.${i}.guid`] = newGuid;
        }
      });
    }

    // One key per converted GUID, so the key count is the conversion count.
    const convertedCount = Object.keys(update).length;
    if (convertedCount === 0) {
      return;
    }

    try {
      coll.updateOne({ _id: doc["_id"] }, { $set: update });
      totalGuidsConverted += convertedCount;
      totalDocumentsUpdated++;
    } catch (e) {
      totalDocumentsFailed++;
      print(`Error updating document ${doc["_id"]}: ${e}`);
    }
  });

  print(`${collName}: done`);
}

// ── WordsCollection and FrontierCollection ──────────────────────────────────
//
// Each Word document has:
// - guid (BinData, top-level)
// - senses[].guid (BinData, per-element in the senses array)

for (const collName of ["WordsCollection", "FrontierCollection"]) {
  migrateCollectionGuids(collName, { topLevelGuid: true, arrayField: "senses" });
}

// ── UserEditsCollection ──────────────────────────────────────────────────────
//
// Each UserEdit document has:
// - edits[].guid (BinData, per-element in the edits array)

migrateCollectionGuids("UserEditsCollection", {
  topLevelGuid: false,
  arrayField: "edits",
});

print(
  `Migration complete. ${totalGuidsConverted} GUID(s) converted in ${totalDocumentsUpdated} document(s).`
);
if (totalDocumentsFailed > 0) {
  print(
    `${totalDocumentsFailed} document(s) could not be updated; see the errors above.`
  );
}
| 169 | + |
| 170 | +// ── Final verification scan ───────────────────────────────────────────────── |
| 171 | +// |
| 172 | +// Recursively scan every collection/document/field and count objects with a |
| 173 | +// sub_type property that is not 4. |
| 174 | + |
/**
 * Count the objects reachable from `root` that have a `sub_type` property
 * whose value is not 4.
 *
 * Any `sub_type` other than 4 counts, not only subtype 3. An object that is
 * counted is not searched further; all other objects and arrays are searched
 * recursively.
 *
 * @param root - A document, sub-document, array, or scalar value.
 * @returns The number of matching objects (0 for non-object input).
 */
function countObjectsWithSubtypeNot4(root: unknown): number {
  // typeof also rules out undefined, so no separate check is needed.
  if (root === null || typeof root !== "object") {
    return 0;
  }

  if (Array.isArray(root)) {
    let count = 0;
    for (const item of root) {
      count += countObjectsWithSubtypeNot4(item);
    }
    return count;
  }

  // Typed arrays / Buffers hold only numbers; expanding them with
  // Object.values would allocate one element per byte for no result.
  if (ArrayBuffer.isView(root)) {
    return 0;
  }

  if ("sub_type" in root && root.sub_type !== 4) {
    return 1;
  }

  let count = 0;
  for (const child of Object.values(root as Record<string, unknown>)) {
    count += countObjectsWithSubtypeNot4(child);
  }
  return count;
}
| 196 | + |
// Grand total of flagged objects across every collection in the database.
let totalNonSubtype4Objects = 0;

db.getCollectionNames().forEach((collName) => {
  print(
    `Scanning collection ${collName} for objects found with sub_type !== 4...`
  );

  // Walk every document, reporting each one that still holds a flagged value.
  const everyDocument = db.getCollection(collName).find({});
  everyDocument.forEach((doc) => {
    const subcount = countObjectsWithSubtypeNot4(doc);
    totalNonSubtype4Objects += subcount;
    if (subcount <= 0) {
      return;
    }
    print(`* doc ${doc["_id"]}: ${subcount} objects with sub_type !== 4`);
  });
});

print(`Final scan: ${totalNonSubtype4Objects} objects found.`);
0 commit comments