11import ParquetWorker from './parquetWorker?worker&inline'
22/// ^ the worker is bundled with the main thread code (inline) which is easier for users to import
33/// (no need to copy the worker file to the right place)
4- import { AsyncBuffer , ColumnData } from 'hyparquet'
4+ import { AsyncBuffer , ColumnData , FileMetaData , ParquetReadOptions } from 'hyparquet'
55import { asyncBufferFromUrl } from '../lib/utils.ts'
6- import type {
7- AsyncBufferFrom ,
8- ParquetMessage ,
9- ParquetReadWorkerOptions ,
10- Row ,
11- } from './types.ts'
12- // import { asyncBufferFromUrl, cachedAsyncBuffer, AsyncBuffer } from 'hyparquet'
6+
/**
 * Serializable constructor for AsyncBuffers.
 *
 * Only plain data is stored here, so the value can be carried inside a worker
 * message (it is the `from` field of the requests posted to the worker).
 */
export interface AsyncBufferFrom {
  /** Location of the file. */
  url: string
  /** Length of the file — presumably in bytes; confirm against the producer. */
  byteLength: number
  /** Optional headers — presumably attached to the requests made to `url`; confirm. */
  headers?: Record<string, string>
}
/**
 * Same as ParquetReadOptions, but with a serializable AsyncBufferFrom (`from`)
 * instead of an AsyncBuffer (`file`).
 */
export interface ParquetReadWorkerOptions extends Omit<ParquetReadOptions, 'file'> {
  /** Description of the file to read, in place of the `file` option. */
  from: AsyncBufferFrom
  /** Name of the column to order by — presumably ordering the returned rows; confirm in the worker. */
  orderBy?: string
  /** Flag carried by sort-index requests (parquetSortIndexWorker posts `sortIndex: true`). */
  sortIndex?: boolean
}
/**
 * A row, keyed by string.
 *
 * Row is defined in hightable, but not exported. Values are typed `unknown`
 * here rather than `any`, so they must be narrowed before use.
 */
export type Row = Record<string, unknown>
21+
/** Fields shared by every message received from the worker. */
interface Message {
  /** Id of the request this message answers; used to look up the pending agent. */
  queryId: number
}
/** Intermediate message: a chunk forwarded to the `onChunk` callback of a rows query. */
export interface ChunkMessage extends Message {
  chunk: ColumnData
}
/** Terminal message of a rows query: the resulting rows. */
export interface ResultMessage extends Message {
  result: Row[]
}
/** Terminal message of a sort-index request: the computed indices. */
export interface IndicesMessage extends Message {
  indices: number[]
}
/** Terminal message of any request that failed. */
export interface ErrorMessage extends Message {
  error: Error
}

/** Messages the worker may send in response to a rows query. */
export type ParquetMessage = ChunkMessage | ResultMessage | ErrorMessage
/** Messages the worker may send in response to a sort-index request. */
export type SortParquetMessage = IndicesMessage | ErrorMessage
40+
/** Options of parquetSortIndexWorker; all of them are forwarded to the worker. */
export interface ParquetSortIndexOptions {
  /** Metadata of the parquet file. */
  metadata: FileMetaData
  /** Serializable description of the file to read. */
  from: AsyncBufferFrom
  /** Name of the column to order by — presumably the column the indices sort on; confirm in the worker. */
  orderBy: string
}
1346
1447
// Shared worker instance, created on the first call to getWorker()
let worker: Worker | undefined
// Counter providing a fresh queryId for each request posted to the worker
let nextQueryId = 0

/** Callbacks of a pending sort-index request. */
interface SortQueryAgent {
  kind: 'sortIndex'
  /** Settles the request with the indices received from the worker. */
  resolve: (value: number[]) => void
  reject: (error: Error) => void
}
/** Callbacks of a pending rows query. */
interface RowsQueryAgent {
  kind: 'query'
  /** Settles the query with the rows received from the worker. */
  resolve: (value: Row[]) => void
  reject: (error: Error) => void
  /** Optional callback invoked for each chunk message received before the result. */
  onChunk?: (chunk: ColumnData) => void
}
/** Pending request, discriminated by `kind` so that messages are routed to the right callbacks. */
type QueryAgent = SortQueryAgent | RowsQueryAgent

// Requests waiting for an answer from the worker, keyed by queryId
const pending = new Map<number, QueryAgent>()
2364
/**
 * Return the shared parquet worker, creating it on first use.
 *
 * The first call instantiates the worker and installs the single `onmessage`
 * handler, which routes each message to the pending agent registered under
 * its `queryId`:
 * - a rows query ('query') is rejected on `error`, resolved on `result`, and
 *   notified through `onChunk` on `chunk`;
 * - a sort-index request ('sortIndex') is rejected on `error` and resolved on
 *   `indices`;
 * - any other message rejects the request.
 *
 * Once a request is settled, its agent is removed from `pending` so that the
 * map (and the callbacks it retains) does not grow with every request. Chunk
 * messages are intermediate and leave the agent registered.
 *
 * @returns the shared worker instance
 * @throws inside the message handler, if a message refers to a `queryId` that is not pending
 */
function getWorker(): Worker {
  if (worker) {
    return worker
  }
  const created: Worker = new ParquetWorker()
  created.onmessage = ({ data }: { data: ParquetMessage | SortParquetMessage }) => {
    const { queryId } = data
    const pendingQueryAgent = pending.get(queryId)
    if (!pendingQueryAgent) {
      // TODO(SL): should never happen. But if it does, I'm not sure if throwing an error here helps.
      throw new Error(
        `Unexpected: no pending promise found for queryId: ${queryId.toString()}`,
      )
    }
    if (pendingQueryAgent.kind === 'query') {
      const { resolve, reject, onChunk } = pendingQueryAgent
      if ('error' in data) {
        pending.delete(queryId)
        reject(data.error)
      } else if ('result' in data) {
        pending.delete(queryId)
        resolve(data.result)
      } else if ('chunk' in data) {
        // Progress only: the query is still running, keep its agent registered
        onChunk?.(data.chunk)
      } else {
        pending.delete(queryId)
        reject(new Error('Unexpected message from worker'))
      }
    } else {
      const { resolve, reject } = pendingQueryAgent
      // Every message received for a sort-index request settles it
      pending.delete(queryId)
      if ('error' in data) {
        reject(data.error)
      } else if ('indices' in data) {
        resolve(data.indices)
      } else {
        reject(new Error('Unexpected message from worker'))
      }
    }
  }
  worker = created
  return created
}
49101
102+
50103/**
51104 * Presents almost the same interface as parquetRead, but runs in a worker.
52105 * This is useful for reading large parquet files without blocking the main thread.
@@ -63,7 +116,7 @@ export function parquetQueryWorker({
63116} : ParquetReadWorkerOptions ) : Promise < Row [ ] > {
64117 return new Promise ( ( resolve , reject ) => {
65118 const queryId = nextQueryId ++
66- pending . set ( queryId , { resolve, reject, onChunk } )
119+ pending . set ( queryId , { kind : 'query' , resolve, reject, onChunk } )
67120 const worker = getWorker ( )
68121
69122 // If caller provided an onChunk callback, worker will send chunks as they are parsed
@@ -80,6 +133,17 @@ export function parquetQueryWorker({
80133 } )
81134}
82135
/**
 * Ask the worker for the sort index of a parquet file.
 *
 * A fresh `queryId` is allocated and a 'sortIndex' agent is registered in
 * `pending` before the request is posted, so the worker's answer can be
 * routed back to this promise.
 *
 * @param options
 * @param options.metadata metadata of the parquet file, forwarded to the worker
 * @param options.from serializable description of the file, forwarded to the worker
 * @param options.orderBy name of the column to order by, forwarded to the worker
 * @returns a promise resolved with the `indices` sent back by the worker, or
 *   rejected with the worker's `error` (or with the error thrown while
 *   posting the request)
 */
export function parquetSortIndexWorker({ metadata, from, orderBy }: ParquetSortIndexOptions): Promise<number[]> {
  return new Promise((resolve, reject) => {
    const queryId = nextQueryId++
    pending.set(queryId, { kind: 'sortIndex', resolve, reject })
    try {
      const worker = getWorker()
      worker.postMessage({
        queryId, metadata, from, orderBy, sortIndex: true,
      })
    } catch (error) {
      // The request never reached the worker, so no answer will ever settle it:
      // drop the agent, then rethrow so the promise rejects with the same error.
      pending.delete(queryId)
      throw error
    }
  })
}
146+
83147/**
84148 * Convert AsyncBufferFrom to AsyncBuffer and cache results.
85149 */
0 commit comments