1 | | -import { DataFrame, DataFrameEvents, ResolvedValue, checkSignal, createEventTarget, validateFetchParams, validateGetCellParams, validateGetRowNumberParams } from 'hightable' |
| 1 | +import { DataFrame, DataFrameEvents, ResolvedValue, arrayDataFrame, checkSignal, createEventTarget, sortableDataFrame, validateFetchParams, validateGetCellParams, validateGetRowNumberParams } from 'hightable' |
2 | 2 | import type { ColumnData } from 'hyparquet' |
3 | | -import { FileMetaData, ParquetReadOptions, parquetSchema } from 'hyparquet' |
| 3 | +import { FileMetaData, ParquetReadOptions, asyncBufferFromUrl, parquetMetadataAsync, parquetSchema } from 'hyparquet' |
| 4 | +import { parseCsv } from './csv.js' |
4 | 5 | import { parquetReadWorker } from './workers/parquetWorkerClient.js' |
5 | 6 | import type { AsyncBufferFrom } from './workers/types.d.ts' |
6 | 7 |
| 8 | +interface TableProviderOptions { |
| 9 | + url: string |
| 10 | + fileName: string |
| 11 | + requestInit?: RequestInit |
| 12 | +} |
| 13 | + |
| 14 | +/** |
| 15 | + * Create a dataframe from a file URL, detecting the file type from the file name extension.
| 16 | + * Supports parquet (the default), CSV (.csv), and JSONL (.jsonl) files.
| 17 | + */ |
| 18 | +export async function tableProvider({ url, fileName, requestInit }: TableProviderOptions): Promise<DataFrame> { |
| 19 | + const asyncBuffer = await asyncBufferFromUrl({ url, requestInit }) |
| 20 | + const from = { url, byteLength: asyncBuffer.byteLength, requestInit } |
| 21 | + |
| 22 | + const baseName = fileName.toLowerCase() |
| 23 | + if (baseName.endsWith('.csv')) { |
| 24 | + return csvDataFrame(from) |
| 25 | + } |
| 26 | + |
| 27 | + if (baseName.endsWith('.jsonl')) { |
| 28 | + return jsonLinesDataFrame(from) |
| 29 | + } |
| 30 | + |
| 31 | + // Default to parquet |
| 32 | + const metadata = await parquetMetadataAsync(asyncBuffer) |
| 33 | + return sortableDataFrame(parquetDataFrame(from, metadata)) |
| 34 | +} |
| 35 | + |
7 | 36 | type GroupStatus = { |
8 | 37 | kind: 'unfetched' |
9 | 38 | } | { |
@@ -130,3 +159,49 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, |
130 | 159 |
131 | 160 | return unsortableDataFrame |
132 | 161 | } |
| 162 | + |
| 163 | +/** |
| 164 | + * Convert a CSV file into a sortable dataframe. |
| 165 | + * |
| 166 | + * Fetches and parses the entire file in memory.
| 167 | + * The first row is treated as the header. |
| 168 | + */ |
| 169 | +export async function csvDataFrame(from: AsyncBufferFrom): Promise<DataFrame> { |
| 170 | + let buffer: ArrayBuffer |
| 171 | + if ('file' in from) { |
| 172 | + buffer = await from.file.arrayBuffer() |
| 173 | + } else { |
| 174 | + const response = await fetch(from.url, from.requestInit) |
| 175 | + buffer = await response.arrayBuffer() |
| 176 | + } |
| 177 | + |
| 178 | + const text = new TextDecoder().decode(buffer) |
| 179 | + const lines = parseCsv(text) |
| 180 | + const header = lines[0] ?? [] |
| 181 | + const rows = lines.slice(1).map(row => { |
| 182 | + return Object.fromEntries(header.map((key, i) => [key, row[i]])) |
| 183 | + }) |
| 184 | + return sortableDataFrame(arrayDataFrame(rows)) |
| 185 | +} |
| 186 | + |
| 187 | +/** |
| 188 | + * Convert a JSONL file into a sortable dataframe. |
| 189 | + * |
| 190 | + * Fetches the entire file and parses each non-empty line as a JSON object.
| 191 | + */ |
| 192 | +export async function jsonLinesDataFrame(from: AsyncBufferFrom): Promise<DataFrame> { |
| 193 | + let buffer: ArrayBuffer |
| 194 | + if ('file' in from) { |
| 195 | + buffer = await from.file.arrayBuffer() |
| 196 | + } else { |
| 197 | + const response = await fetch(from.url, from.requestInit) |
| 198 | + buffer = await response.arrayBuffer() |
| 199 | + } |
| 200 | + |
| 201 | + const text = new TextDecoder().decode(buffer).trimEnd() |
| 202 | + const lines = text.split('\n').filter(line => line.trim()) |
| 203 | + const rows: Record<string, unknown>[] = lines.map(line => { |
| 204 | +    return JSON.parse(line) as Record<string, unknown>
| 205 | + }) |
| 206 | + return sortableDataFrame(arrayDataFrame(rows)) |
| 207 | +} |
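| 208 | +
| 209 | +// Usage sketch (illustrative only; the URL and file name below are placeholders,
| 210 | +// not part of this module). The caller passes the file URL and name, and the
| 211 | +// extension selects the CSV, JSONL, or parquet path:
| 212 | +//
| 213 | +//   const df = await tableProvider({
| 214 | +//     url: 'https://example.com/data/table.csv',
| 215 | +//     fileName: 'table.csv',
| 216 | +//   })
| 217 | +//
| 218 | +// The returned value implements hightable's DataFrame interface; how rows are
| 219 | +// read from it is defined by hightable and not shown here.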