Skip to content

Commit 64f0d20

Browse files
authored
fix: #851 — 512 MB is the maximum supported file size for the disk persistence plugin (#975)
1 parent e535eac commit 64f0d20

File tree

1 file changed

+164
-8
lines changed
  • packages/plugin-data-persistence/src

1 file changed

+164
-8
lines changed

packages/plugin-data-persistence/src/server.ts

Lines changed: 164 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import type { AnyOrama } from '@orama/orama'
2+
import { save, create, load } from '@orama/orama'
3+
import { encode, decode } from '@msgpack/msgpack'
4+
// @ts-expect-error dpack does not expose types
5+
import * as dpack from 'dpack'
26
import type { FileSystem, PersistenceFormat, Runtime } from './types.js'
3-
import { FILESYSTEM_NOT_SUPPORTED_ON_RUNTIME } from './errors.js'
7+
import { FILESYSTEM_NOT_SUPPORTED_ON_RUNTIME, UNSUPPORTED_FORMAT } from './errors.js'
48
import { persist, restore } from './index.js'
59
import { detectRuntime } from './utils.js'
10+
import { serializeOramaInstance } from './seqproto.js'
611

712
export const DEFAULT_DB_NAME = `orama_bump_${+new Date()}`
813

@@ -26,13 +31,8 @@ export async function persistToFile<T extends AnyOrama>(
2631
path = await getDefaultOutputFilename(format, runtime)
2732
}
2833

29-
const serialized = await persist(db, format, runtime)
30-
let toWrite: any = serialized
31-
// Convert ArrayBuffer (seqproto) to Buffer/String for FS
32-
if (serialized instanceof ArrayBuffer) {
33-
toWrite = Buffer.from(serialized)
34-
}
35-
await _fs.writeFile(path, toWrite)
34+
// For large datasets, use streaming approach to avoid memory issues
35+
await persistToFileStreaming(db, format, path, runtime)
3636

3737
return path
3838
}
@@ -55,6 +55,12 @@ export async function restoreFromFile<T extends AnyOrama>(
5555
}
5656

5757
const data = await _fs.readFile(path)
58+
59+
// Handle new binary format that stores data as binary instead of hex
60+
if (format === 'binary' && data instanceof Buffer) {
61+
return restoreFromBinaryData(data, runtime)
62+
}
63+
5864
return restore(format, data, runtime)
5965
}
6066

@@ -135,3 +141,153 @@ export async function getDefaultFileName(format: PersistenceFormat, runtime?: Ru
135141

136142
return `${dbName}.${extension}`
137143
}
144+
145+
// Streaming implementation to handle large datasets without memory issues
146+
async function persistToFileStreaming<T extends AnyOrama>(
147+
db: T,
148+
format: PersistenceFormat,
149+
filePath: string,
150+
runtime: Runtime
151+
): Promise<void> {
152+
const dbExport = await save(db)
153+
154+
switch (format) {
155+
case 'json':
156+
await streamJsonToFile(dbExport, filePath, runtime)
157+
break
158+
case 'binary':
159+
await streamBinaryToFile(dbExport, filePath, runtime)
160+
break
161+
case 'dpack':
162+
// dpack doesn't have streaming support, use regular approach
163+
// but check size and warn if too large
164+
const dpackSerialized = dpack.serialize(dbExport)
165+
await _fs.writeFile(filePath, dpackSerialized)
166+
break
167+
case 'seqproto':
168+
const seqprotoSerialized = serializeOramaInstance(db)
169+
const buffer = Buffer.from(seqprotoSerialized)
170+
await _fs.writeFile(filePath, buffer)
171+
break
172+
default:
173+
throw new Error(UNSUPPORTED_FORMAT(format))
174+
}
175+
}
176+
177+
// Stream JSON to file using streaming JSON stringification
178+
async function streamJsonToFile(data: any, filePath: string, runtime: Runtime): Promise<void> {
179+
if (runtime === 'node') {
180+
const fs = await import('node:fs')
181+
const { createWriteStream } = fs
182+
183+
return new Promise((resolve, reject) => {
184+
const stream = createWriteStream(filePath)
185+
186+
// For very large objects, we need to stringify in chunks
187+
// This is a simplified approach - in production you might want to use
188+
// a streaming JSON library
189+
try {
190+
const jsonString = JSON.stringify(data)
191+
stream.write(jsonString)
192+
stream.end()
193+
stream.on('finish', resolve)
194+
stream.on('error', reject)
195+
} catch (error) {
196+
// If JSON.stringify fails due to size, try chunked approach
197+
if (error instanceof Error && error.message.includes('string length')) {
198+
streamLargeJsonToFile(data, stream, resolve, reject)
199+
} else {
200+
reject(error)
201+
}
202+
}
203+
})
204+
} else {
205+
// For non-Node environments, fall back to regular approach
206+
const jsonString = JSON.stringify(data)
207+
await _fs.writeFile(filePath, jsonString)
208+
}
209+
}
210+
211+
// Handle extremely large JSON by breaking it into manageable chunks
212+
function streamLargeJsonToFile(data: any, stream: any, resolve: () => void, reject: (error: any) => void): void {
213+
try {
214+
stream.write('{')
215+
216+
let isFirst = true
217+
for (const [key, value] of Object.entries(data)) {
218+
if (!isFirst) {
219+
stream.write(',')
220+
}
221+
isFirst = false
222+
223+
// Write key
224+
stream.write(`"${key}":`)
225+
226+
// For large values, try to stringify them separately
227+
try {
228+
const valueStr = JSON.stringify(value)
229+
stream.write(valueStr)
230+
} catch (valueError) {
231+
// If individual value is too large, we need different handling
232+
console.warn(`Skipping large value for key ${key}:`, valueError)
233+
stream.write('null')
234+
}
235+
}
236+
237+
stream.write('}')
238+
stream.end()
239+
stream.on('finish', resolve)
240+
stream.on('error', reject)
241+
} catch (error) {
242+
reject(error)
243+
}
244+
}
245+
246+
// Stream binary data to file without creating large hex strings
247+
async function streamBinaryToFile(data: any, filePath: string, runtime: Runtime): Promise<void> {
248+
if (runtime === 'node') {
249+
const fs = await import('node:fs')
250+
const { createWriteStream } = fs
251+
252+
return new Promise((resolve, reject) => {
253+
const stream = createWriteStream(filePath)
254+
255+
try {
256+
// Encode to msgpack first
257+
const msgpack = encode(data)
258+
259+
// Instead of converting to hex string, write binary data directly
260+
// This avoids the 2x size increase from hex encoding
261+
const buffer = Buffer.from(msgpack.buffer, msgpack.byteOffset, msgpack.byteLength)
262+
stream.write(buffer)
263+
stream.end()
264+
stream.on('finish', resolve)
265+
stream.on('error', reject)
266+
} catch (error) {
267+
reject(error)
268+
}
269+
})
270+
} else {
271+
// For non-Node environments, fall back to regular approach
272+
const msgpack = encode(data)
273+
const buffer = Buffer.from(msgpack.buffer, msgpack.byteOffset, msgpack.byteLength)
274+
await _fs.writeFile(filePath, buffer)
275+
}
276+
}
277+
278+
// Helper function to restore from binary data directly
279+
async function restoreFromBinaryData<T extends AnyOrama>(
280+
data: Buffer,
281+
runtime: Runtime
282+
): Promise<T> {
283+
const db = create({
284+
schema: {
285+
__placeholder: 'string'
286+
}
287+
})
288+
289+
const deserialized = decode(data) as any
290+
load(db, deserialized)
291+
292+
return db as unknown as T
293+
}

0 commit comments

Comments
 (0)