11import type { ColumnData } from 'hyparquet'
2- import type { ClientMessage , ParquetReadObjectsWorkerOptions , ParquetReadWorkerOptions , Rows , WorkerMessage } from './types.js'
2+ import type { ClientMessage , ParquetQueryWorkerOptions , ParquetReadObjectsWorkerOptions , ParquetReadWorkerOptions , Rows , WorkerMessage } from './types.js'
33
// Module-level Worker handle; it may be undefined.
// NOTE(review): presumably created lazily by getWorker() — confirm against its definition.
44let worker : Worker | undefined
// Counter incremented once per request to allocate the queryId that tags messages exchanged with the worker.
55let nextQueryId = 0
@@ -8,8 +8,9 @@ interface Agent {
88 onChunk ?: ( chunk : ColumnData ) => void
99 onPage ?: ( page : ColumnData ) => void
1010 reject : ( error : Error ) => void
11- resolveEmpty ?: ( ) => void
12- resolveRowObjects ?: ( rowObjects : Rows ) => void
11+ parquetReadResolve ?: ( ) => void
12+ parquetReadObjectsResolve ?: ( rows : Rows ) => void
13+ parquetQueryResolve ?: ( rows : Rows ) => void
1314}
1415
// In-flight requests keyed by queryId; each Agent holds the callbacks used to deliver the worker's replies.
1516const pendingAgents = new Map < number , Agent > ( )
@@ -26,24 +27,37 @@ function getWorker() {
2627 return
2728 }
2829
29- const { onComplete, onChunk, onPage, reject, resolveEmpty, resolveRowObjects } = pendingAgent
30- if ( 'rows' in data ) {
31- onComplete ?.( data . rows )
32- } else if ( 'chunk' in data ) {
33- onChunk ?.( data . chunk )
34- } else if ( 'page' in data ) {
35- onPage ?.( data . page )
36- } else {
37- if ( 'error' in data ) {
38- reject ( data . error )
39- } else if ( 'rowObjects' in data ) {
40- resolveRowObjects ?.( data . rowObjects )
41- } else {
42- resolveEmpty ?.( )
43- }
44- /* clean up */
45- pendingAgents . delete ( data . queryId )
46- // TODO(SL): maybe terminate the worker when no pending agents left
30+ const { onComplete, onChunk, onPage, reject, parquetReadResolve, parquetReadObjectsResolve, parquetQueryResolve } = pendingAgent
31+ switch ( data . kind ) {
32+ case 'onComplete' :
33+ onComplete ?.( data . rows )
34+ break
35+ case 'onChunk' :
36+ onChunk ?.( data . chunk )
37+ break
38+ case 'onPage' :
39+ onPage ?.( data . page )
40+ break
41+ default :
42+ switch ( data . kind ) {
43+ case 'onReject' :
44+ if ( 'error' in data ) { // check, just in case
45+ reject ( data . error )
46+ }
47+ break
48+ case 'onParquetReadResolve' :
49+ parquetReadResolve ?.( )
50+ break
51+ case 'onParquetReadObjectsResolve' :
52+ parquetReadObjectsResolve ?.( data . rows )
53+ break
54+ case 'onParquetQueryResolve' :
55+ parquetQueryResolve ?.( data . rows )
56+ break
57+ }
58+ /* clean up */
59+ pendingAgents . delete ( data . queryId )
60+ // TODO(SL): maybe terminate the worker when no pending agents left
4761 }
4862 }
4963 }
@@ -58,12 +72,12 @@ function getWorker() {
5872 * the default parsers.
5973 */
export function parquetReadWorker(options: ParquetReadWorkerOptions): Promise<void> {
  // The callbacks stay on this side (registered in pendingAgents); only `from`
  // and the remaining options are posted to the worker.
  const { onComplete, onChunk, onPage, from, ...serializableOptions } = options
  return new Promise((resolve, reject) => {
    const queryId = nextQueryId++
    // Register before posting so that any reply tagged with this queryId finds its agent.
    pendingAgents.set(queryId, { parquetReadResolve: resolve, reject, onComplete, onChunk, onPage })
    try {
      const message: ClientMessage = { queryId, from, kind: 'parquetRead', options: serializableOptions }
      getWorker().postMessage(message)
    } catch (error) {
      // getWorker() or postMessage() threw synchronously (e.g. the message could
      // not be cloned): no reply will ever arrive for this queryId, so remove the
      // agent instead of leaking it, and reject with the original error.
      pendingAgents.delete(queryId)
      reject(error)
    }
  })
}
@@ -76,12 +90,30 @@ export function parquetReadWorker(options: ParquetReadWorkerOptions): Promise<vo
7690 * the default parsers.
7791 */
export function parquetReadObjectsWorker(options: ParquetReadObjectsWorkerOptions): Promise<Rows> {
  // The callbacks stay on this side (registered in pendingAgents); only `from`
  // and the remaining options are posted to the worker.
  const { onChunk, onPage, from, ...serializableOptions } = options
  return new Promise((resolve, reject) => {
    const queryId = nextQueryId++
    // Register before posting so that any reply tagged with this queryId finds its agent.
    pendingAgents.set(queryId, { parquetReadObjectsResolve: resolve, reject, onChunk, onPage })
    try {
      const message: ClientMessage = { queryId, from, kind: 'parquetReadObjects', options: serializableOptions }
      getWorker().postMessage(message)
    } catch (error) {
      // getWorker() or postMessage() threw synchronously (e.g. the message could
      // not be cloned): no reply will ever arrive for this queryId, so remove the
      // agent instead of leaking it, and reject with the original error.
      pendingAgents.delete(queryId)
      reject(error)
    }
  })
}
102+
/**
 * Presents almost the same interface as parquetQuery, but runs in a worker.
 * This is useful for reading large parquet files without blocking the main thread.
 * Instead of taking an AsyncBuffer, it takes an AsyncBufferFrom, because it needs
 * to be serialized to the worker. Also: the worker uses hyparquet-compressors and
 * the default parsers.
 *
 * @param options - Query options. `onComplete`, `onChunk` and `onPage` are kept on
 *   the calling side; `from` and the remaining options are posted to the worker.
 * @returns A promise that resolves with the rows delivered by the worker's
 *   'onParquetQueryResolve' reply, and rejects if the worker reports an error or
 *   if the request cannot be posted.
 */
export function parquetQueryWorker(options: ParquetQueryWorkerOptions): Promise<Rows> {
  const { onComplete, onChunk, onPage, from, ...serializableOptions } = options
  return new Promise((resolve, reject) => {
    const queryId = nextQueryId++
    // Register before posting so that any reply tagged with this queryId finds its agent.
    pendingAgents.set(queryId, { parquetQueryResolve: resolve, reject, onComplete, onChunk, onPage })
    try {
      const message: ClientMessage = { queryId, from, kind: 'parquetQuery', options: serializableOptions }
      getWorker().postMessage(message)
    } catch (error) {
      // getWorker() or postMessage() threw synchronously (e.g. the message could
      // not be cloned): no reply will ever arrive for this queryId, so remove the
      // agent instead of leaking it, and reject with the original error.
      pendingAgents.delete(queryId)
      reject(error)
    }
  })
}
0 commit comments