@@ -6,6 +6,7 @@ import { TaskStatus } from "./models";
66import { NDJSONWriteStream } from "./ndjson"
77import { isLargeFile } from './utils'
88import { sleep } from 'botasaurus/utils'
9+ import { normalizeItem } from 'botasaurus/output'
910
1011export function createProjection ( ets : string [ ] ) {
1112 return ets . reduce ( ( acc : any , field ) => {
@@ -14,16 +15,11 @@ export function createProjection(ets: string[]) {
1415 } , { } ) ;
1516}
1617
17- function normalizeKeys ( firstObjectKeysMapping : any , item : any ) {
18- for ( const key of firstObjectKeysMapping ) {
19- item [ key ] = item [ key ] === undefined ? null : item [ key ]
20- }
21- }
2218
23- function populateMissingKeys ( item : any , firstObjectKeysMapping : any ) {
24- for ( const key of Object . keys ( item ) ) {
25- if ( ! ( key in firstObjectKeysMapping ) ) {
26- firstObjectKeysMapping [ key ] = null
19+ function populateMissingKeys ( newKeys : string [ ] , allKeysMapping : any ) {
20+ for ( const key of newKeys ) {
21+ if ( ! ( key in allKeysMapping ) ) {
22+ allKeysMapping [ key ] = null
2723 }
2824 }
2925}
@@ -91,51 +87,54 @@ function renameTemporaryFile(tempFilePath: string, taskFilePathTemp: string) {
9187 } )
9288}
9389
94- function arraysEqual ( a :any , b :any ) {
95- if ( a === b ) return true ;
96- if ( a == null || b == null ) return false ;
97- if ( a . length !== b . length ) return false ;
98-
99- // If you don't care about the order of the elements inside
100- // the array, you should sort both arrays here.
101- // Please note that calling sort on an array will modify that array.
102- // you might want to clone your array first.
103-
104- for ( var i = 0 ; i < a . length ; ++ i ) {
105- if ( a [ i ] !== b [ i ] ) return false ;
106- }
107- return true ;
108- }
109-
11090async function normalizeAndDeduplicateChildrenTasks ( ids : number [ ] , parentId :number , removeDuplicatesBy : string | null ) {
11191 let itemsCount = 0
112- let shouldNormalize = false
113- let firstItem :any = null
114- let firstObjectKeysMapping : any = null
115- let firstObjectKeys : any = null
92+ let allKeysMapping : any = null
93+ let allKeys : any = null
94+ let firstItemKeyCount = 0
11695 const taskFilePath = TaskResults . generateTaskFilePath ( parentId )
11796
97+ // First pass: collect all unique keys from all objects
11898 await TaskResults . streamMultipleTask ( ids , ( item ) => {
119- if ( firstItem === null ) {
120- firstItem = item
121- firstObjectKeysMapping = createKeyToNullMapping ( firstItem )
122- firstObjectKeys = Object . keys ( firstItem )
123- }
124-
125- if ( ! arraysEqual ( firstObjectKeys , Object . keys ( item ) ) ) {
126- shouldNormalize = true
127- populateMissingKeys ( item , firstObjectKeysMapping )
99+ const itemKeys = Object . keys ( item )
100+
101+ if ( allKeysMapping === null ) {
102+ // First item: initialize with its keys
103+ allKeysMapping = createKeyToNullMapping ( item )
104+ firstItemKeyCount = itemKeys . length
105+ } else {
106+ const currentKeyCount = Object . keys ( allKeysMapping ) . length
107+
108+ if ( itemKeys . length !== currentKeyCount ) {
109+ // Different number of keys - collect new ones
110+ populateMissingKeys ( itemKeys , allKeysMapping )
111+ } else {
112+ // Same number of keys, but check if there are any new keys (different keys, not just different order)
113+ for ( const key of itemKeys ) {
114+ if ( ! ( key in allKeysMapping ) ) {
115+ // Found a new key, collect all keys from this item
116+ populateMissingKeys ( itemKeys , allKeysMapping )
117+ break
118+ }
119+ }
120+ }
128121 }
129122 } )
130123
124+ // After first pass, get the complete list of all keys
125+ if ( allKeysMapping !== null ) {
126+ allKeys = Object . keys ( allKeysMapping )
127+ }
128+
129+ const shouldNormalize = allKeys && allKeys . length !== firstItemKeyCount
131130
132131 const tempfile = taskFilePath + '.temp'
133132 const ndjsonWriteStream = new NDJSONWriteStream ( tempfile )
134133 const seen = new Set ( )
135134 try {
136135 await TaskResults . streamMultipleTask ( ids , async ( item ) => {
137136 if ( shouldNormalize ) {
138- normalizeKeys ( firstObjectKeys , item )
137+ item = normalizeItem ( allKeys , item )
139138 }
140139
141140 if ( removeDuplicatesBy ) {
0 commit comments