Skip to content

Commit f78a5eb

Browse files
committed
add HF source from the space repository
1 parent 81fdaf3 commit f78a5eb

File tree

5 files changed

+531
-1
lines changed

5 files changed

+531
-1
lines changed

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet"
5656
},
5757
"dependencies": {
58+
"@huggingface/hub": "2.6.12",
5859
"hightable": "0.20.2",
5960
"hyparquet": "1.20.0",
6061
"hyparquet-compressors": "1.1.1",

src/components/App/App.tsx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { useMemo } from 'react'
22
import { Config, ConfigProvider } from '../../hooks/useConfig.js'
33
import { getHttpSource } from '../../lib/sources/httpSource.js'
4+
import { getHuggingFaceSource } from '../../lib/sources/huggingFaceSource.js'
45
import { getHyperparamSource } from '../../lib/sources/hyperparamSource.js'
56
import Page from '../Page/Page.js'
67

@@ -10,7 +11,9 @@ export default function App() {
1011
const row = search.get('row') === null ? undefined : Number(search.get('row'))
1112
const col = search.get('col') === null ? undefined : Number(search.get('col'))
1213

13-
const source = getHttpSource(sourceId) ?? getHyperparamSource(sourceId, { endpoint: location.origin })
14+
const source = getHuggingFaceSource(sourceId) ??
15+
getHttpSource(sourceId) ??
16+
getHyperparamSource(sourceId, { endpoint: location.origin })
1417

1518
// Memoize the config to avoid creating a new object on each render
1619
const config: Config = useMemo(() => ({
src/lib/sources/huggingFaceSource.ts

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
import { listFiles } from '@huggingface/hub'
2+
import type { DirSource, FileMetadata, FileSource, SourcePart } from 'hyperparam'
3+
import { getFileName } from 'hyperparam'
4+
5+
// URL prefix used to build the breadcrumb links of Hugging Face dataset repositories.
export const baseUrl = 'https://huggingface.co/datasets'
6+
7+
/**
 * Build the breadcrumb parts for a parsed Hugging Face dataset URL.
 *
 * The first part is always the repository root at the URL's branch. It is
 * followed by one part per intermediate directory of the path and, when the
 * path is not empty, by a final part for its last segment.
 *
 * @param url - parsed Hugging Face URL
 * @returns the breadcrumb parts, ordered from the repository root to the last path segment
 */
function getSourceParts(url: HFUrl): SourcePart[] {
  const repoUrl = `${baseUrl}/${url.repo}`
  const sourceParts: SourcePart[] = [{
    // the root links to the directory view ("tree"), while its text keeps the action of the URL
    sourceId: `${repoUrl}/tree/${url.branch}/`,
    text: `${repoUrl}/${url.action}/${url.branch}/`,
  }]

  // drop the empty segments produced by leading, trailing or repeated slashes
  const pathParts = url.path.split('/').filter(d => d.length > 0)
  const lastPart = pathParts.at(-1)
  if (lastPart) {
    // every segment but the last one is an intermediate directory
    for (const [i, part] of pathParts.slice(0, -1).entries()) {
      sourceParts.push({
        sourceId: `${repoUrl}/tree/${url.branch}/${pathParts.slice(0, i + 1).join('/')}`,
        text: part + '/',
      })
    }
    // the last segment keeps the action (tree, blob or resolve) and the full path of the URL
    sourceParts.push({
      sourceId: `${repoUrl}/${url.action}/${url.branch}${url.path}`,
      text: lastPart,
    })
  }
  return sourceParts
}
29+
/**
 * Build the prefix of a directory: its "tree" URL without a trailing slash.
 *
 * @param url - parsed directory URL
 * @returns `{origin}/datasets/{repo}/tree/{branch}{path}` with one trailing slash removed, if any
 */
function getPrefix(url: DirectoryUrl): string {
  return `${url.origin}/datasets/${url.repo}/tree/${url.branch}${url.path}`.replace(/\/$/, '')
}
32+
/**
 * List the entries of a directory of a Hugging Face dataset repository.
 *
 * @param url - parsed directory URL
 * @param [options]
 * @param [options.requestInit] - accepted for signature parity with the other helpers; not forwarded to `listFiles`
 * @param [options.accessToken] - access token passed to `listFiles`
 * @returns the metadata of every entry yielded by `listFiles`
 */
async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise<FileMetadata[]> {
  const filesIterator = listFiles({
    repo: `datasets/${url.repo}`,
    revision: url.branch,
    path: 'path' in url ? url.path.replace(/^\//, '') : '', // remove leading slash if any
    expand: true,
    accessToken: options?.accessToken,
  })
  const files: FileMetadata[] = []
  for await (const file of filesIterator) {
    const isFile = file.type === 'file'
    files.push({
      name: getFileName(file.path),
      eTag: file.lastCommit?.id,
      size: file.size,
      lastModified: file.lastCommit?.date,
      // files link to the "blob" view, everything else to the "tree" view
      sourceId: `${url.origin}/datasets/${url.repo}/${isFile ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''),
      kind: isFile ? 'file' : 'directory', // 'unknown' is considered as a directory
    })
  }
  return files
}
53+
/**
 * Create a file or directory source from a Hugging Face dataset URL.
 *
 * @param sourceId - URL of a file or directory in a Hugging Face dataset repository
 * @param [options]
 * @param [options.requestInit] - request init object, attached to file sources and used when fetching the refs
 * @param [options.accessToken] - access token used when listing files and fetching the refs
 * @returns a file source, a directory source, or undefined when sourceId cannot be parsed as a Hugging Face URL
 */
export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined {
  try {
    const url = parseHuggingFaceUrl(sourceId)
    // Lazily list the refs of the repository, each one linking to the same path at that ref
    async function fetchVersions() {
      const refsList = await fetchRefsList(url.repo, options)
      return {
        label: 'Branches',
        versions: refsList.map(({ refType, name, ref }) => {
          const label = refType === 'branches' ? name :
            refType === 'converts' ? `[convert] ${name}` :
              refType === 'tags' ? `[tag] ${name}` :
                `[pr] ${name}`
          // remove the leading refs/heads/ from the ref name
          // e.g. refs/heads/main -> main
          // NOTE(review): other ref types are used as-is; confirm parseHuggingFaceUrl accepts them (e.g. tags)
          const fixedRef = refType === 'branches' ? ref.replace(/^refs\/heads\//, '') : ref
          const branchSourceId = `${url.origin}/datasets/${url.repo}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}`
          return {
            label,
            sourceId: branchSourceId,
          }
        }),
      }
    }
    if (url.kind === 'file') {
      return {
        kind: 'file',
        sourceId,
        sourceParts: getSourceParts(url),
        fileName: getFileName(url.path),
        resolveUrl: url.resolveUrl,
        requestInit: options?.requestInit,
        fetchVersions,
      }
    } else {
      return {
        kind: 'directory',
        sourceId,
        sourceParts: getSourceParts(url),
        prefix: getPrefix(url),
        listFiles: () => fetchFilesList(url, options),
        fetchVersions,
      }
    }
  } catch {
    // not a supported Hugging Face URL: let the caller fall back to another source
    return undefined
  }
}
100+
101+
/** Parsed URL of a directory (or of the root) of a Hugging Face dataset repository. */
export interface DirectoryUrl {
  kind: 'directory';
  /** URL of the directory: the input URL for a repository root, a rebuilt URL otherwise */
  source: string;
  /** origin of the URL, e.g. https://huggingface.co */
  origin: string;
  /** namespace/dataset */
  repo: string;
  action: 'tree';
  /** branch or ref, with its slashes encoded as %2F */
  branch: string;
  /** path inside the repository: empty, or starting with a slash */
  path: string;
}

/** Parsed URL of a file of a Hugging Face dataset repository. */
export interface FileUrl {
  kind: 'file';
  /** rebuilt URL of the file, keeping the action of the input URL */
  source: string;
  /** origin of the URL, e.g. https://huggingface.co */
  origin: string;
  /** namespace/dataset */
  repo: string;
  action: 'resolve' | 'blob';
  /** branch or ref, with its slashes encoded as %2F */
  branch: string;
  /** path inside the repository, starting with a slash */
  path: string;
  /** URL of the file using the "resolve" action */
  resolveUrl: string;
}

/** Any parsed Hugging Face dataset URL, discriminated by `kind`. */
type HFUrl = DirectoryUrl | FileUrl;
123+
124+
/**
 * Parse a Hugging Face dataset URL into its components.
 *
 * Supported forms, on the hosts `huggingface.co` and `hf.co`, over HTTPS only:
 * - `/datasets/{namespace}/{dataset}`: repository root, the branch is set to `main`
 * - `/datasets/{namespace}/{dataset}/tree/{branch}[/{path}]`: directory
 * - `/datasets/{namespace}/{dataset}/(blob|resolve)/{branch}/{path}`: file
 *
 * `{branch}` is either a single segment, or `refs/convert/{name}` or `refs/pr/{name}`.
 * Its slashes are encoded as `%2F` in the returned `branch`, `source` and `resolveUrl`.
 *
 * @param url - URL to parse
 * @returns the parsed directory or file URL
 * @throws {TypeError} if url is not a valid URL
 * @throws {Error} if the protocol is not HTTP(S), if the URL is not an HTTPS Hugging Face URL,
 * or if its path has none of the supported forms
 */
export function parseHuggingFaceUrl(url: string): HFUrl {
  const urlObject = new URL(url)
  // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL

  if (urlObject.protocol !== 'https:' && urlObject.protocol !== 'http:') {
    throw new Error('url must be a HTTP URL')
  }

  // plain HTTP is rejected too: only HTTPS on a known Hugging Face host is accepted
  if (urlObject.protocol !== 'https:' || !['huggingface.co', 'hf.co'].includes(urlObject.host)) {
    throw new Error('Not a Hugging Face URL')
  }

  const { origin, pathname } = urlObject

  // 1. repository root, with an optional trailing slash
  const repoGroups = /^\/datasets\/(?<namespace>[^/]+)\/(?<dataset>[^/]+)\/?$/.exec(pathname)?.groups
  if (repoGroups?.namespace !== undefined && repoGroups.dataset !== undefined) {
    return {
      kind: 'directory',
      source: url,
      origin,
      repo: repoGroups.namespace + '/' + repoGroups.dataset,
      action: 'tree',
      branch: 'main', // hardcode the default branch
      path: '',
    }
  }

  // 2. directory, with an optional trailing slash
  const folderGroups =
    /^\/datasets\/(?<namespace>[^/]+)\/(?<dataset>[^/]+)\/(?<action>tree)\/(?<branch>(refs\/(convert|pr)\/)?[^/]+)(?<path>(\/[^/]+)*)\/?$/.exec(pathname)?.groups
  if (
    folderGroups?.namespace !== undefined &&
    folderGroups.dataset !== undefined &&
    folderGroups.action !== undefined &&
    folderGroups.branch !== undefined &&
    folderGroups.path !== undefined &&
    // a bare "refs" means the ref prefix was not followed by a supported ref type
    folderGroups.branch !== 'refs'
  ) {
    const repo = folderGroups.namespace + '/' + folderGroups.dataset
    const branch = folderGroups.branch.replace(/\//g, '%2F')
    return {
      kind: 'directory',
      source: `${origin}/datasets/${repo}/${folderGroups.action}/${branch}${folderGroups.path}`,
      origin,
      repo,
      action: 'tree',
      branch,
      path: folderGroups.path,
    }
  }

  // 3. file: the path must have at least one segment and no trailing slash
  const fileGroups =
    /^\/datasets\/(?<namespace>[^/]+)\/(?<dataset>[^/]+)\/(?<action>blob|resolve)\/(?<branch>(refs\/(convert|pr)\/)?[^/]+)(?<path>(\/[^/]+)+)$/.exec(pathname)?.groups
  if (
    fileGroups?.namespace !== undefined &&
    fileGroups.dataset !== undefined &&
    fileGroups.action !== undefined &&
    fileGroups.branch !== undefined &&
    fileGroups.path !== undefined &&
    fileGroups.branch !== 'refs'
  ) {
    const repo = fileGroups.namespace + '/' + fileGroups.dataset
    const branch = fileGroups.branch.replace(/\//g, '%2F')
    return {
      kind: 'file',
      source: `${origin}/datasets/${repo}/${fileGroups.action}/${branch}${fileGroups.path}`,
      origin,
      repo,
      action: fileGroups.action === 'blob' ? 'blob' : 'resolve',
      branch,
      path: fileGroups.path,
      resolveUrl: `${origin}/datasets/${repo}/resolve/${branch}${fileGroups.path}`,
    }
  }

  throw new Error('Unsupported Hugging Face URL')
}
207+
208+
/** One ref, as found in the response of the refs API. */
interface RefResponse {
  name: string;
  ref: string;
  targetCommit: string;
}

/** Keys of the refs API response that are read, in the order the refs are returned. */
export const refTypes = [
  'branches',
  'tags',
  'converts',
  'pullRequests',
] as const
type RefType = (typeof refTypes)[number];
/** Response of the refs API: a list of refs per ref type, each of them optional. */
type RefsResponse = Partial<Record<RefType, RefResponse[]>>;

/** A ref, along with the type of ref it was listed under. */
export interface RefMetadata extends RefResponse {
  refType: RefType; // TODO(SL): use it to style the refs differently?
}
226+
227+
/**
 * List refs in a HF dataset repo
 *
 * Example API URL: https://huggingface.co/api/datasets/codeparrot/github-code/refs
 *
 * The refs are returned grouped by type, in the order of `refTypes`. Types
 * missing from the response are skipped.
 *
 * @param repo (namespace/repo)
 * @param [options]
 * @param [options.requestInit] - request init object to pass to fetch
 * @param [options.accessToken] - access token to use for authentication
 *
 * @returns the list of branches, tags, pull requests, and converts
 * @throws {TypeError} if the access token is set and does not start with 'hf_'
 * @throws {Error} if the HTTP response is not OK
 */
export async function fetchRefsList(
  repo: string,
  options?: {requestInit?: RequestInit, accessToken?: string}
): Promise<RefMetadata[]> {
  if (options?.accessToken && !options.accessToken.startsWith('hf_')) {
    throw new TypeError('Your access token must start with \'hf_\'')
  }
  // start from the caller's headers, then force the ones this request needs
  const headers = new Headers(options?.requestInit?.headers)
  headers.set('accept', 'application/json')
  if (options?.accessToken) {
    headers.set('Authorization', `Bearer ${options.accessToken}`)
  }
  const response = await fetch(`https://huggingface.co/api/datasets/${repo}/refs`, { ...options?.requestInit, headers })
  if (!response.ok) {
    throw new Error(`HTTP error ${response.status.toString()}`)
  }
  const refsByType = await response.json() as RefsResponse
  return refTypes.flatMap(refType =>
    (refsByType[refType] ?? []).map(ref => ({ refType, ...ref }))
  )
}

src/lib/sources/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
export { getHttpSource } from './httpSource.js'
22
export { getHyperparamSource } from './hyperparamSource.js'
3+
export { getHuggingFaceSource } from './huggingFaceSource.js'
34
export type { HyperparamFileMetadata } from './hyperparamSource.js'
45
export type { DirSource, FileKind, FileMetadata, FileSource, Source, SourcePart } from './types.js'
56
export { getFileName } from './utils.js'

0 commit comments

Comments
 (0)