Commit 125d20a

upgrade hightable to 0.13.1 (#180)

* upgrade hightable to 0.13.0 + adapt style. Note that more is needed (orderBy in particular)
* implement multi column sort
* use the same stringify function
* export Cells, not Row
* import the built hightable CSS
* stringify bigint
* only fetch the values for the column, without transposing to objects
* rename variable 'classes' to 'styles'
* upgrade to hightable 0.13.1 and use its exported stringify function
1 parent 9836c62 commit 125d20a

12 files changed (+238 −124 lines)

package.json

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@
     "watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet"
   },
   "dependencies": {
-    "hightable": "0.12.1",
+    "hightable": "0.13.1",
     "hyparquet": "1.9.1",
     "hyparquet-compressors": "1.0.0",
     "icebird": "0.1.8",

src/components/Cell.tsx

Lines changed: 1 addition & 27 deletions
@@ -1,3 +1,4 @@
+import { stringify } from 'hightable'
 import { asyncBufferFromUrl, parquetMetadataAsync } from 'hyparquet'
 import { useEffect, useState } from 'react'
 import type { FileSource } from '../lib/sources/types.js'
@@ -75,30 +76,3 @@ export default function CellView({ source, row, col, config }: CellProps) {
     </Layout>
   )
 }
-
-/**
- * Robust stringification of any value, including json and bigints.
- */
-function stringify(value: unknown): string {
-  if (typeof value === 'string') return value
-  if (typeof value === 'number') return value.toLocaleString('en-US')
-  if (Array.isArray(value)) {
-    return `[\n${value.map((v) => indent(stringify(v), 2)).join(',\n')}\n]`
-  }
-  if (value === null || value === undefined) return JSON.stringify(value)
-  if (value instanceof Date) return value.toISOString()
-  if (typeof value === 'object') {
-    return `{${Object.entries(value)
-      .filter((d) => d[1] !== undefined)
-      .map(([k, v]) => `${k}: ${stringify(v)}`)
-      .join(', ')}}`
-  }
-  return '{}'
-}
-
-function indent(text: string | undefined, spaces: number) {
-  return text
-    ?.split('\n')
-    .map((line) => ' '.repeat(spaces) + line)
-    .join('\n')
-}

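The removed local helper is superseded by hightable's exported stringify, which per the commit message also covers bigint values. A minimal sketch of the substitution (the sample value is illustrative; the exact output format is defined by hightable, not shown here):

import { stringify } from 'hightable'

// any cell value becomes a display string; bigint no longer needs special casing
const text = stringify({ id: 123n, tags: ['a', 'b'], created: new Date(0) })
console.log(text)
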
src/components/viewers/CellPanel.tsx

Lines changed: 1 addition & 2 deletions
@@ -36,8 +36,7 @@ export default function CellPanel({ df, row, col, setProgress, setError, onClose
       if (asyncCell === undefined) {
         throw new Error(`Cell missing at column ${columnName}`)
       }
-      /* TODO(SL): use the same implementation of stringify, here and in Cell.tsx */
-      const text = await asyncCell.then(cell => stringify(cell as unknown) ?? '{}')
+      const text = await asyncCell.then(stringify)
       setText(text)
     } catch (error) {
       setError(error as Error)

src/components/viewers/ParquetView.tsx

Lines changed: 6 additions & 2 deletions
@@ -1,9 +1,10 @@
-import HighTable, { DataFrame, rowCache } from 'hightable'
+import HighTable, { DataFrame, rowCache, stringify } from 'hightable'
 import { asyncBufferFromUrl, parquetMetadataAsync } from 'hyparquet'
 import React, { useCallback, useEffect, useState } from 'react'
 import { RoutesConfig, appendSearchParams } from '../../lib/routes.js'
 import { FileSource } from '../../lib/sources/types.js'
 import { parquetDataFrame } from '../../lib/tableProvider.js'
+import styles from '../../styles/ParquetView.module.css'
 import { Spinner } from '../Layout.js'
 import CellPanel from './CellPanel.js'
 import ContentHeader, { ContentSize } from './ContentHeader.js'
@@ -100,7 +101,10 @@ export default function ParquetView({ source, setProgress, setError, config }: V
         data={content.dataframe}
         onDoubleClickCell={onDoubleClickCell}
         onMouseDownCell={onMouseDownCell}
-        onError={setError} />}
+        onError={setError}
+        className={styles.hightable}
+        stringify={stringify}
+      />}

       {isLoading && <div className='center'><Spinner /></div>}
     </ContentHeader>

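A minimal sketch of how the two new props are wired, assuming only what the diff shows (the CSS module class and hightable's exported stringify); the wrapper component and its prop names are illustrative, not part of this commit:

import HighTable, { DataFrame, stringify } from 'hightable'
import styles from '../../styles/ParquetView.module.css'

// illustrative wrapper: pass a dataframe plus the new className and stringify props
function TableSketch({ dataframe }: { dataframe: DataFrame }) {
  return (
    <HighTable
      data={dataframe}
      className={styles.hightable}
      stringify={stringify}
      onError={console.error}
    />
  )
}
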
src/lib/getParquetColumn.ts

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+import { ColumnData, ParquetReadOptions, parquetRead } from 'hyparquet'
+
+type GetColumnOptions = Omit<ParquetReadOptions, 'columns' | 'rowStart' | 'rowEnd' | 'onChunk' | 'onComplete'> & {column: string}
+
+export async function getParquetColumn({ metadata, file, column, compressors }: GetColumnOptions): Promise<unknown[]> {
+  const numRows = Number(metadata?.num_rows)
+  if (isNaN(numRows)) {
+    throw new Error('metadata.num_rows is undefined')
+  }
+  if (numRows === 0) {
+    return []
+  }
+  const lastError: {error?: Error} = {}
+  const values: unknown[] = Array(numRows).fill(undefined)
+  const ranges: [number, number][] = []
+  function onChunk({ columnName, columnData, rowStart, rowEnd }: ColumnData) {
+    if (columnName !== column) {
+      lastError.error = new Error(`unexpected column name ${columnName}`)
+    }
+    for (let i = rowStart; i < rowEnd; i++) {
+      values[i] = columnData[i - rowStart]
+    }
+    ranges.push([rowStart, rowEnd])
+  }
+
+  // this awaits all the promises. When it returns, all the data should have already been sent using onChunk
+  await parquetRead({ metadata, file, columns: [column], compressors, onChunk })
+
+  // Do some checks before returning the data
+
+  // check for errors
+  if (lastError.error !== undefined) {
+    throw lastError.error
+  }
+
+  // check for missing data (should be faster than checking for undefined values in the array)
+  const sortedRanges = ranges.sort((a, b) => a[0] - b[0])
+  for (let i = 0; i < sortedRanges.length - 1; i++) {
+    const range = sortedRanges[i]
+    const nextRange = sortedRanges[i + 1]
+    if (!range || !nextRange) {
+      throw new Error('The ranges should not be undefined')
+    }
+    if (range[1] !== nextRange[0]) {
+      throw new Error(`missing data between rows ${range[1]} and ${nextRange[0]}`)
+    }
+  }
+  const firstRange = sortedRanges[0]
+  if (!firstRange) {
+    throw new Error('The first range should not be undefined')
+  }
+  if (firstRange[0] !== 0) {
+    throw new Error(`missing data before row ${firstRange[0]}`)
+  }
+  const lastRange = sortedRanges[sortedRanges.length - 1]
+  if (!lastRange) {
+    throw new Error('The last range should not be undefined')
+  }
+  if (lastRange[1] !== numRows) {
+    throw new Error(`missing data after row ${lastRange[1]}`)
+  }
+
+  // return the values
+  return values
+}

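A hypothetical usage sketch (not part of this commit): given an already-opened file and its parsed metadata, the helper returns one value per row for a single column, without transposing to row objects. The column name below is a placeholder:

import type { AsyncBuffer, FileMetaData } from 'hyparquet'
import { compressors } from 'hyparquet-compressors'
import { getParquetColumn } from './getParquetColumn.js'

async function readColumn(file: AsyncBuffer, metadata: FileMetaData): Promise<unknown[]> {
  // values come back in row order; an error is thrown if any row range is missing
  return getParquetColumn({ metadata, file, column: 'name', compressors })
}
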
src/lib/index.ts

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ export * from './sources/index.js'
 export { parquetDataFrame } from './tableProvider.js'
 export { asyncBufferFrom, cn, contentTypes, formatFileSize, getFileDate, getFileDateShort, imageTypes, parseFileSize } from './utils.js'
 export { parquetQueryWorker } from './workers/parquetWorkerClient.js'
-export type { AsyncBufferFrom, Row } from './workers/types.js'
+export type { AsyncBufferFrom, Cells } from './workers/types.js'

src/lib/tableProvider.ts

Lines changed: 49 additions & 9 deletions
@@ -1,15 +1,41 @@
-import { DataFrame, ResolvableRow, resolvableRow } from 'hightable'
+import { DataFrame, OrderBy, ResolvableRow, resolvableRow } from 'hightable'
 import { FileMetaData, parquetSchema } from 'hyparquet'
-import { parquetQueryWorker, parquetSortIndexWorker } from './workers/parquetWorkerClient.js'
+import { parquetColumnRanksWorker, parquetQueryWorker } from './workers/parquetWorkerClient.js'
 import type { AsyncBufferFrom } from './workers/types.d.ts'

+/*
+ * sortIndex[0] gives the index of the first row in the sorted table
+ */
+export function computeSortIndex(orderByRanks: { direction: 'ascending' | 'descending', ranks: number[] }[]): number[] {
+  if (!(0 in orderByRanks)) {
+    throw new Error('orderByRanks should have at least one element')
+  }
+  const numRows = orderByRanks[0].ranks.length
+  return Array
+    .from({ length: numRows }, (_, i) => i)
+    .sort((a, b) => {
+      for (const { direction, ranks } of orderByRanks) {
+        const rankA = ranks[a]
+        const rankB = ranks[b]
+        if (rankA === undefined || rankB === undefined) {
+          throw new Error('Invalid ranks')
+        }
+        const value = direction === 'ascending' ? 1 : -1
+        if (rankA < rankB) return -value
+        if (rankA > rankB) return value
+      }
+      return 0
+    })
+}
+
 /**
  * Convert a parquet file into a dataframe.
  */
 export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData): DataFrame {
   const { children } = parquetSchema(metadata)
   const header = children.map(child => child.element.name)
   const sortCache = new Map<string, Promise<number[]>>()
+  const columnRanksCache = new Map<string, Promise<number[]>>()
   const data = new Array<ResolvableRow | undefined>(Number(metadata.num_rows))
   const groups = new Array(metadata.row_groups.length).fill(false)
   let groupStart = 0
@@ -34,7 +60,8 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData):
         throw new Error(`Missing data row for index ${i}`)
       }
       dataRow.index.resolve(i)
-      const row = groupData[i - rowStart]
+      const j = i - rowStart
+      const row = groupData[j]
       if (row === undefined) {
         throw new Error(`Missing row in groupData for index: ${i - rowStart}`)
       }
@@ -54,20 +81,33 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData):
     }
   }

-  function getSortIndex(orderBy: string) {
-    let sortIndex = sortCache.get(orderBy)
+  function getColumnRanks(column: string): Promise<number[]> {
+    let columnRanks = columnRanksCache.get(column)
+    if (!columnRanks) {
+      columnRanks = parquetColumnRanksWorker({ from, metadata, column })
+      columnRanksCache.set(column, columnRanks)
+    }
+    return columnRanks
+  }
+
+  function getSortIndex(orderBy: OrderBy): Promise<number[]> {
+    const orderByKey = JSON.stringify(orderBy)
+    let sortIndex = sortCache.get(orderByKey)
     if (!sortIndex) {
-      sortIndex = parquetSortIndexWorker({ from, metadata, orderBy })
-      sortCache.set(orderBy, sortIndex)
+      const orderByRanksPromise = Promise.all(
+        orderBy.map(({ column, direction }) => getColumnRanks(column).then(ranks => ({ direction, ranks })))
+      )
+      sortIndex = orderByRanksPromise.then(orderByRanks => computeSortIndex(orderByRanks))
+      sortCache.set(orderByKey, sortIndex)
     }
     return sortIndex
   }

   return {
     header,
     numRows: Number(metadata.num_rows),
-    rows({ start, end, orderBy }: { start: number, end: number, orderBy?: string}) {
-      if (orderBy) {
+    rows({ start, end, orderBy }: { start: number, end: number, orderBy?: OrderBy}) {
+      if (orderBy?.length) {
        const numRows = end - start
        const wrapped = new Array(numRows).fill(null).map(() => resolvableRow(header))

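A worked example of the exported computeSortIndex with illustrative numbers: the first column's ranks decide the order, and the second, descending, column only breaks ties.

import { computeSortIndex } from './tableProvider.js'

const sortIndex = computeSortIndex([
  { direction: 'ascending', ranks: [1, 0, 1] },  // rows 0 and 2 tie on the first column
  { direction: 'descending', ranks: [0, 2, 1] }, // tie broken by the second column, descending
])
// sortIndex[0] is the original index of the first row in the sorted table
console.log(sortIndex) // [1, 2, 0]
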
src/lib/workers/parquetWorker.ts

Lines changed: 37 additions & 21 deletions
@@ -1,7 +1,8 @@
 import { ColumnData, parquetQuery } from 'hyparquet'
 import { compressors } from 'hyparquet-compressors'
+import { getParquetColumn } from '../getParquetColumn.js'
 import { asyncBufferFrom } from '../utils.js'
-import type { ChunkMessage, ErrorMessage, IndicesMessage, ParquetReadWorkerOptions, ResultMessage } from './types.js'
+import type { ChunkMessage, ClientMessage, ColumnRanksMessage, ErrorMessage, ResultMessage } from './types.js'

 function postChunkMessage ({ chunk, queryId }: ChunkMessage) {
   self.postMessage({ chunk, queryId })
@@ -12,35 +13,50 @@ function postResultMessage ({ result, queryId }: ResultMessage) {
 function postErrorMessage ({ error, queryId }: ErrorMessage) {
   self.postMessage({ error, queryId })
 }
-function postIndicesMessage ({ indices, queryId }: IndicesMessage) {
-  self.postMessage({ indices, queryId })
+function postColumnRanksMessage ({ columnRanks, queryId }: ColumnRanksMessage) {
+  self.postMessage({ columnRanks, queryId })
 }

-self.onmessage = async ({ data }: {
-  data: ParquetReadWorkerOptions & { queryId: number; chunks: boolean };
-}) => {
-  const { metadata, from, rowStart, rowEnd, orderBy, columns, queryId, chunks, sortIndex } = data
+self.onmessage = async ({ data }: { data: ClientMessage }) => {
+  const { metadata, from, kind, queryId } = data
   const file = await asyncBufferFrom(from)
-  if (sortIndex === undefined) {
-    const onChunk = chunks ? (chunk: ColumnData) => { postChunkMessage({ chunk, queryId }) } : undefined
+  if (kind === 'columnRanks') {
+    const { column } = data
+    // return the column ranks in ascending order
+    // we can get the descending order replacing the rank with numRows - rank - 1. It's not exactly the rank of
+    // the descending order, because the rank is the first, not the last, of the ties. But it's enough for the
+    // purpose of sorting.
+
     try {
-      const result = await parquetQuery({ metadata, file, rowStart, rowEnd, orderBy, columns, compressors, onChunk })
-      postResultMessage({ result, queryId })
+      const sortColumn = await getParquetColumn({ metadata, file, column, compressors })
+      const valuesWithIndex = sortColumn.map((value, index) => ({ value, index }))
+      const sortedValuesWithIndex = Array.from(valuesWithIndex).sort(({ value: a }, { value: b }) => compare<unknown>(a, b))
+      const numRows = sortedValuesWithIndex.length
+      const columnRanks = sortedValuesWithIndex.reduce((accumulator, currentValue, rank) => {
+        const { lastValue, lastRank, ranks } = accumulator
+        const { value, index } = currentValue
+        if (value === lastValue) {
+          ranks[index] = lastRank
+          return { ranks, lastValue, lastRank }
+        } else {
+          ranks[index] = rank
+          return { ranks, lastValue: value, lastRank: rank }
+        }
+      }, {
+        ranks: Array(numRows).fill(-1) as number[],
+        lastValue: undefined as unknown,
+        lastRank: 0,
+      }).ranks
+      postColumnRanksMessage({ columnRanks: columnRanks, queryId })
    } catch (error) {
       postErrorMessage({ error: error as Error, queryId })
     }
   } else {
+    const { rowStart, rowEnd, chunks } = data
+    const onChunk = chunks ? (chunk: ColumnData) => { postChunkMessage({ chunk, queryId }) } : undefined
     try {
-      // Special case for sorted index
-      if (orderBy === undefined)
-        throw new Error('sortParquetWorker requires orderBy')
-      if (rowStart !== undefined || rowEnd !== undefined)
-        throw new Error('sortIndex requires all rows')
-      const sortColumn = await parquetQuery({ metadata, file, columns: [orderBy], compressors })
-      const indices = Array.from(sortColumn, (_, index) => index).sort((a, b) =>
-        compare<unknown>(sortColumn[a]?.[orderBy], sortColumn[b]?.[orderBy])
-      )
-      postIndicesMessage({ indices, queryId })
+      const result = await parquetQuery({ metadata, file, rowStart, rowEnd, compressors, onChunk })
+      postResultMessage({ result, queryId })
     } catch (error) {
       postErrorMessage({ error: error as Error, queryId })
     }

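A standalone illustration of the rank computation above, with hypothetical values: ties share the rank of their first sorted occurrence, which is why the worker only ever needs the ascending ranks.

// simplified version of the worker's reduce: ranks[i] is the ascending rank of values[i]
const values = ['b', 'a', 'a', 'c']
const sorted = values
  .map((value, index) => ({ value, index }))
  .sort((x, y) => x.value < y.value ? -1 : x.value > y.value ? 1 : 0)
const ranks = Array<number>(values.length).fill(-1)
let lastValue: string | undefined
let lastRank = 0
sorted.forEach(({ value, index }, rank) => {
  if (value !== lastValue) {
    lastValue = value
    lastRank = rank
  }
  ranks[index] = lastRank
})
console.log(ranks) // [2, 0, 0, 3]: both 'a' rows share rank 0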