Skip to content

Commit 606ea70

Browse files
committed
vendor listFiles from @huggingface/hub
1 parent 5d16147 commit 606ea70

File tree

2 files changed

+81
-10
lines changed

2 files changed

+81
-10
lines changed

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@
5555
"watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet"
5656
},
5757
"dependencies": {
58-
"@huggingface/hub": "2.6.12",
5958
"hightable": "0.20.2",
6059
"hyparquet": "1.20.0",
6160
"hyparquet-compressors": "1.1.1",

src/lib/sources/huggingFaceSource.ts

Lines changed: 81 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
import { type RepoFullName, type RepoType, listFiles } from '@huggingface/hub'
21
import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js'
32
import { getFileName } from './utils.js'
43

4+
/** Repository kinds distinguished by this module: a model, a dataset or a space. */
type RepoType = 'model' | 'dataset' | 'space'
5+
56
interface BaseUrl {
67
source: string
78
origin: string
@@ -45,7 +46,7 @@ interface RefMetadata extends RefResponse {
4546

4647
/** Root URL of the Hugging Face website, without a trailing slash. */
const baseUrl = 'https://huggingface.co'
4748

48-
function getFullName(url: HFUrl): RepoFullName {
49+
/**
 * Build the repository path segment for a parsed Hugging Face URL.
 *
 * Datasets are prefixed with `datasets/`, spaces with `spaces/`; any other
 * type yields the bare repository name.
 *
 * @param url - parsed URL carrying the repository `type` and `repo` name
 * @returns the (possibly prefixed) repository name
 */
function getFullName(url: HFUrl): string {
  switch (url.type) {
    case 'dataset':
      return `datasets/${url.repo}`
    case 'space':
      return `spaces/${url.repo}`
    default:
      return url.repo
  }
}
5152
function getSourceParts(url: HFUrl): SourcePart[] {
@@ -74,15 +75,12 @@ function getSourceParts(url: HFUrl): SourcePart[] {
7475
/**
 * Build the `tree` URL prefix of a directory: origin, repository full name,
 * branch and path, with a single trailing slash removed if present.
 *
 * @param url - parsed directory URL
 * @returns the prefix without a trailing slash
 */
function getPrefix(url: DirectoryUrl): string {
  const repoFullName = getFullName(url)
  const prefix = `${url.origin}/${repoFullName}/tree/${url.branch}${url.path}`
  // Drop at most one trailing slash.
  return prefix.endsWith('/') ? prefix.slice(0, -1) : prefix
}
77-
async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise<FileMetadata[]> {
78+
async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise<FileMetadata[]> {
79+
const repoFullName = getFullName(url)
7880
const filesIterator = listFiles({
79-
repo: {
80-
name: url.repo,
81-
type: url.type,
82-
},
81+
repoFullName,
8382
revision: url.branch,
8483
path: 'path' in url ? url.path.replace(/^\//, '') : '', // remove leading slash if any
85-
expand: true,
8684
accessToken: options?.accessToken,
8785
})
8886
const files: FileMetadata[] = []
@@ -256,7 +254,7 @@ export function parseHuggingFaceUrl(url: string): HFUrl {
256254
*
257255
* @returns the list of branches, tags, pull requests, and converts
258256
*/
259-
export async function fetchRefsList(
257+
async function fetchRefsList(
260258
url: HFUrl,
261259
options?: {requestInit?: RequestInit, accessToken?: string}
262260
): Promise<RefMetadata[]> {
@@ -286,3 +284,77 @@ export async function fetchRefsList(
286284
})
287285
})
288286
}
287+
288+
/*
289+
* Copied and adapted from https://github.com/huggingface/huggingface.js/blob/main/packages/hub
290+
* MIT License, Copyright (c) 2023 Hugging Face
291+
*/
292+
293+
/**
 * One entry of a repository tree listing, as yielded by `listFiles`.
 */
interface ListFileEntry {
  /** Kind of entry. */
  type: 'file' | 'directory' | 'unknown'
  /** Size of the entry (presumably in bytes — confirm against the Hub API). */
  size: number
  /** Path of the entry (presumably relative to the repository root — confirm). */
  path: string
  /** Last commit touching the entry; optional, so it may be absent from a response. */
  lastCommit?: {
    /** Commit date as a string (presumably an ISO 8601 timestamp — confirm). */
    date: string
    /** Commit identifier. */
    id: string
  }
}
302+
303+
const HUB_URL = 'https://huggingface.co'

/**
 * List the entries of a folder in a Hugging Face repository.
 *
 * Requests `<HUB_URL>/api/<repo>/tree/<revision>[/<path>]?expand=true` and
 * yields every entry of the JSON array in the response. While a response
 * carries a `Link` header with a `rel="next"` target, that target is fetched
 * and its entries are yielded too.
 *
 * No `recursive` query parameter is sent (presumably the Hub then lists only
 * the direct children of the folder — confirm against the Hub API docs).
 *
 * @param params.repoFullName - repository name, optionally prefixed with
 *   `models/`, `datasets/` or `spaces/`. A name without one of these prefixes
 *   is treated as a model and prefixed with `models/`.
 * @param params.path - folder to list, eg `'data'`, without a leading slash.
 *   Leave it empty to list the root of the repository.
 * @param params.revision - branch, tag or commit to list. Defaults to `'main'`.
 * @param params.fetch - custom fetch function to use instead of the default
 *   one, for example to use a proxy or edit headers.
 * @param params.accessToken - when set, sent as an `Authorization: Bearer` header.
 * @throws Error when a response status is not OK.
 */
async function* listFiles(
  params: {
    repoFullName: string
    path?: string
    revision?: string
    fetch?: typeof fetch
    accessToken?: string
  }
): AsyncGenerator<ListFileEntry> {
  // The Hub API namespaces every repository kind (`/api/models/…`,
  // `/api/datasets/…`, `/api/spaces/…`), whereas website URLs leave models
  // unprefixed. Add the missing `models/` so model repositories resolve.
  const apiRepoPath = /^(?:models|datasets|spaces)\//.test(params.repoFullName)
    ? params.repoFullName
    : `models/${params.repoFullName}`
  const revision = params.revision ?? 'main'
  const folder = params.path ? `/${params.path}` : ''
  const doFetch = params.fetch ?? fetch

  const headers: Record<string, string> = { accept: 'application/json' }
  if (params.accessToken) {
    headers.Authorization = `Bearer ${params.accessToken}`
  }

  let url: string | undefined = `${HUB_URL}/api/${apiRepoPath}/tree/${revision}${folder}?expand=true`

  while (url) {
    const res: Response = await doFetch(url, { headers })

    if (!res.ok) {
      throw new Error(`Failed to list files: ${res.status.toString()} ${res.statusText}`)
    }

    const items = await res.json() as ListFileEntry[]
    yield* items

    // Pagination: follow the `rel="next"` link until there is none.
    const linkHeader = res.headers.get('Link')
    url = linkHeader ? parseLinkHeader(linkHeader).next : undefined
  }
}
352+
353+
/**
 * Parse a Link HTTP header, eg `<https://huggingface.co/api/datasets/bigscience/P3/tree/main?recursive=1&cursor=...>; rel="next"`
 *
 * Only `http(s)` targets immediately followed by a quoted `rel` parameter are
 * recognized. Whitespace around the `;` separator is optional, so both
 * `<url>; rel="next"` and `<url>;rel="next"` are accepted.
 *
 * @param header - raw value of the `Link` header
 * @returns a map from relation name (eg `next`) to target URL; empty when
 *   nothing matches. If a relation appears several times, the last one wins.
 */
export function parseLinkHeader(header: string): Record<string, string> {
  const regex = /<(https?:[/][/][^>]+)>\s*;\s*rel="([^"]+)"/g

  return Object.fromEntries([...header.matchAll(regex)].map(([, url, rel]) => [rel, url])) as Record<string, string>
}

0 commit comments

Comments
 (0)