diff --git a/packages/hub/README.md b/packages/hub/README.md
index e51b778c49..8fc8140f9e 100644
--- a/packages/hub/README.md
+++ b/packages/hub/README.md
@@ -110,6 +110,19 @@ console.log(oauthResult);
 
 Checkout the demo: https://huggingface.co/spaces/huggingfacejs/client-side-oauth
 
+## Hugging Face cache
+
+The `@huggingface/hub` package provides basic capabilities to scan the cache directory. Learn more about [Manage huggingface_hub cache-system](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache).
+
+```ts
+import { scanCacheDir } from "@huggingface/hub";
+
+const result = await scanCacheDir();
+
+console.log(result);
+```
+Note that the cache directory is created and used only by the Python and Rust libraries. Downloading files using the `@huggingface/hub` package won't use the cache directory.
+
 ## Performance considerations
 
 When uploading large files, you may want to run the `commit` calls inside a worker, to offload the sha256 computations.
diff --git a/packages/hub/package.json b/packages/hub/package.json
index cf5a967535..2513264072 100644
--- a/packages/hub/package.json
+++ b/packages/hub/package.json
@@ -20,6 +20,7 @@
 	"browser": {
 		"./src/utils/sha256-node.ts": false,
 		"./src/utils/FileBlob.ts": false,
+		"./src/lib/cache-management.ts": false,
 		"./dist/index.js": "./dist/browser/index.js",
 		"./dist/index.mjs": "./dist/browser/index.mjs"
 	},
diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts
new file mode 100644
index 0000000000..3bfed63d92
--- /dev/null
+++ b/packages/hub/src/lib/cache-management.spec.ts
@@ -0,0 +1,137 @@
+import { describe, test, expect, vi, beforeEach } from "vitest";
+import {
+	scanCacheDir,
+	scanCachedRepo,
+	scanSnapshotDir,
+	parseRepoType,
+	getBlobStat,
+	type CachedFileInfo,
+} from "./cache-management";
+import { stat, readdir, realpath, lstat } from "node:fs/promises";
+import type { Dirent, Stats } from "node:fs";
+import { join } from "node:path";
+
+// Mock the node:fs/promises module so no real filesystem access happens.
+vi.mock("node:fs/promises");
+
+beforeEach(() => {
+	vi.resetAllMocks();
+	vi.restoreAllMocks();
+});
+
+describe("scanCacheDir", () => {
+	test("should throw an error if cacheDir is not a directory", async () => {
+		vi.mocked(stat).mockResolvedValueOnce({
+			isDirectory: () => false,
+		} as Stats);
+
+		await expect(scanCacheDir("/fake/dir")).rejects.toThrow("Scan cache expects a directory");
+	});
+
+	test("empty directory should return an empty set of repository and no warnings", async () => {
+		vi.mocked(stat).mockResolvedValueOnce({
+			isDirectory: () => true,
+		} as Stats);
+
+		// mock empty cache folder
+		vi.mocked(readdir).mockResolvedValue([]);
+
+		const result = await scanCacheDir("/fake/dir");
+
+		// cacheDir must have been read
+		expect(readdir).toHaveBeenCalledWith("/fake/dir");
+
+		expect(result.warnings.length).toBe(0);
+		expect(result.repos).toHaveLength(0);
+		expect(result.size).toBe(0);
+	});
+});
+
+describe("scanCachedRepo", () => {
+	test("should throw an error for invalid repo path", async () => {
+		await expect(() => {
+			return scanCachedRepo("/fake/repo_path");
+		}).rejects.toThrow("Repo path is not a valid HuggingFace cache directory");
+	});
+
+	test("should throw an error if the snapshot folder does not exist", async () => {
+		vi.mocked(readdir).mockResolvedValue([]);
+		vi.mocked(stat).mockResolvedValue({
+			isDirectory: () => false,
+		} as Stats);
+
+		await expect(() => {
+			return scanCachedRepo("/fake/cacheDir/models--hello-world--name");
+		}).rejects.toThrow("Snapshots dir doesn't exist in cached repo");
+	});
+
+	test("should properly parse the repository name", async () => {
+		const repoPath = "/fake/cacheDir/models--hello-world--name";
+		vi.mocked(readdir).mockResolvedValue([]);
+		vi.mocked(stat).mockResolvedValue({
+			isDirectory: () => true,
+		} as Stats);
+
+		const result = await scanCachedRepo(repoPath);
+		expect(readdir).toHaveBeenCalledWith(join(repoPath, "refs"), {
+			withFileTypes: true,
+		});
+
+		expect(result.id.name).toBe("hello-world/name");
+		expect(result.id.type).toBe("model");
+	});
+});
+
+describe("scanSnapshotDir", () => {
+	test("should scan a valid snapshot directory", async () => {
+		const cachedFiles: CachedFileInfo[] = [];
+		const blobStats = new Map<string, Stats>();
+		vi.mocked(readdir).mockResolvedValueOnce([{ name: "file1", isDirectory: () => false } as Dirent]);
+
+		vi.mocked(realpath).mockResolvedValueOnce("/fake/realpath");
+		vi.mocked(lstat).mockResolvedValueOnce({ size: 1024, atimeMs: Date.now(), mtimeMs: Date.now() } as Stats);
+
+		await scanSnapshotDir("/fake/revision", cachedFiles, blobStats);
+
+		expect(cachedFiles).toHaveLength(1);
+		expect(blobStats.size).toBe(1);
+	});
+});
+
+describe("getBlobStat", () => {
+	test("should retrieve blob stat if already cached", async () => {
+		const blobStats = new Map<string, Stats>([["/fake/blob", { size: 1024 } as Stats]]);
+		const result = await getBlobStat("/fake/blob", blobStats);
+
+		expect(lstat).not.toHaveBeenCalled();
+		expect(result.size).toBe(1024);
+	});
+
+	test("should fetch and cache blob stat if not cached", async () => {
+		const blobStats = new Map<string, Stats>();
+		vi.mocked(lstat).mockResolvedValueOnce({ size: 2048 } as Stats);
+
+		const result = await getBlobStat("/fake/blob", blobStats);
+
+		expect(result.size).toBe(2048);
+		expect(blobStats.size).toBe(1);
+	});
+});
+
+describe("parseRepoType", () => {
+	test("should parse models repo type", () => {
+		expect(parseRepoType("models")).toBe("model");
+	});
+
+	test("should parse dataset repo type", () => {
+		expect(parseRepoType("datasets")).toBe("dataset");
+	});
+
+	test("should parse space repo type", () => {
+		expect(parseRepoType("spaces")).toBe("space");
+	});
+
+	test("should throw an error for invalid repo type", () => {
+		expect(() => parseRepoType("invalid")).toThrowError("Invalid repo type: invalid");
+	});
+});
diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts
new file mode 100644
index 0000000000..aecbf271e1
--- /dev/null
+++ b/packages/hub/src/lib/cache-management.ts
@@ -0,0 +1,258 @@
+import { homedir } from "node:os";
+import { join, basename } from "node:path";
+import { stat, readdir, readFile, realpath, lstat } from "node:fs/promises";
+import type { Stats } from "node:fs";
+import type { RepoType, RepoId } from "../types/public";
+
+// Default cache root (~/.cache), mirroring huggingface_hub's Python defaults.
+function getDefaultHome(): string {
+	return join(homedir(), ".cache");
+}
+
+// HF_HOME > XDG_CACHE_HOME/huggingface > ~/.cache/huggingface, then "/hub".
+function getDefaultCachePath(): string {
+	return join(process.env["HF_HOME"] ?? join(process.env["XDG_CACHE_HOME"] ?? getDefaultHome(), "huggingface"), "hub");
+}
+
+function getHuggingFaceHubCache(): string {
+	return process.env["HUGGINGFACE_HUB_CACHE"] ?? getDefaultCachePath();
+}
+
+// HF_HUB_CACHE takes precedence over the legacy HUGGINGFACE_HUB_CACHE variable.
+function getHFHubCache(): string {
+	return process.env["HF_HUB_CACHE"] ?? getHuggingFaceHubCache();
+}
+
+const FILES_TO_IGNORE: string[] = [".DS_Store"];
+
+export interface CachedFileInfo {
+	path: string;
+	/**
+	 * Underlying file - which `path` is symlinked to
+	 */
+	blob: {
+		size: number;
+		path: string;
+		lastModifiedAt: Date;
+		lastAccessedAt: Date;
+	};
+}
+
+export interface CachedRevisionInfo {
+	commitOid: string;
+	path: string;
+	size: number;
+	files: CachedFileInfo[];
+	refs: string[];
+
+	lastModifiedAt: Date;
+}
+
+export interface CachedRepoInfo {
+	id: RepoId;
+	path: string;
+	size: number;
+	filesCount: number;
+	revisions: CachedRevisionInfo[];
+
+	lastAccessedAt: Date;
+	lastModifiedAt: Date;
+}
+
+export interface HFCacheInfo {
+	size: number;
+	repos: CachedRepoInfo[];
+	warnings: Error[];
+}
+
+/**
+ * Scan the Hugging Face cache directory and return information about it.
+ *
+ * @param cacheDir - cache directory to scan; defaults to the HF hub cache
+ *   resolved from HF_HUB_CACHE / HUGGINGFACE_HUB_CACHE / HF_HOME.
+ * @throws if `cacheDir` is not a directory.
+ */
+export async function scanCacheDir(cacheDir: string | undefined = undefined): Promise<HFCacheInfo> {
+	if (!cacheDir) cacheDir = getHFHubCache();
+
+	const s = await stat(cacheDir);
+	if (!s.isDirectory()) {
+		throw new Error(
+			`Scan cache expects a directory but found a file: ${cacheDir}. Please use \`cacheDir\` argument or set \`HF_HUB_CACHE\` environment variable.`
+		);
+	}
+
+	const repos: CachedRepoInfo[] = [];
+	const warnings: Error[] = [];
+
+	const directories = await readdir(cacheDir);
+	for (const repo of directories) {
+		// skip .locks folder
+		if (repo === ".locks") continue;
+
+		// get the absolute path of the repo
+		const absolute = join(cacheDir, repo);
+
+		// ignore non-directory element
+		const s = await stat(absolute);
+		if (!s.isDirectory()) {
+			continue;
+		}
+
+		try {
+			const cached = await scanCachedRepo(absolute);
+			repos.push(cached);
+		} catch (err: unknown) {
+			// A corrupted repo is reported as a warning, not a hard failure.
+			warnings.push(err as Error);
+		}
+	}
+
+	return {
+		repos: repos,
+		size: [...repos.values()].reduce((sum, repo) => sum + repo.size, 0),
+		warnings: warnings,
+	};
+}
+
+/**
+ * Scan a single cached repo folder (e.g. `models--org--name`) and collect
+ * its revisions, refs and blob statistics.
+ *
+ * @throws if the folder name is not a valid cache entry, if the snapshots
+ *   dir is missing, or if a ref points to a missing commit hash.
+ */
+export async function scanCachedRepo(repoPath: string): Promise<CachedRepoInfo> {
+	// get the directory name
+	const name = basename(repoPath);
+	if (!name.includes("--")) {
+		throw new Error(`Repo path is not a valid HuggingFace cache directory: ${name}`);
+	}
+
+	// parse the repoId from directory name
+	const [type, ...remaining] = name.split("--");
+	const repoType = parseRepoType(type);
+	const repoId = remaining.join("/");
+
+	const snapshotsPath = join(repoPath, "snapshots");
+	const refsPath = join(repoPath, "refs");
+
+	const snapshotStat = await stat(snapshotsPath);
+	if (!snapshotStat.isDirectory()) {
+		throw new Error(`Snapshots dir doesn't exist in cached repo ${snapshotsPath}`);
+	}
+
+	// Check if the refs directory exists and scan it
+	const refsByHash: Map<string, string[]> = new Map();
+	const refsStat = await stat(refsPath);
+	if (refsStat.isDirectory()) {
+		await scanRefsDir(refsPath, refsByHash);
+	}
+
+	// Scan snapshots directory and collect cached revision information
+	const cachedRevisions: CachedRevisionInfo[] = [];
+	const blobStats: Map<string, Stats> = new Map(); // Store blob stats, keyed by blob path
+
+	const snapshotDirs = await readdir(snapshotsPath);
+	for (const dir of snapshotDirs) {
+		if (FILES_TO_IGNORE.includes(dir)) continue; // Ignore unwanted files
+
+		const revisionPath = join(snapshotsPath, dir);
+		const revisionStat = await stat(revisionPath);
+		if (!revisionStat.isDirectory()) {
+			throw new Error(`Snapshots folder corrupted. Found a file: ${revisionPath}`);
+		}
+
+		const cachedFiles: CachedFileInfo[] = [];
+		await scanSnapshotDir(revisionPath, cachedFiles, blobStats);
+
+		// Revision mtime = newest blob mtime; fall back to the dir's own mtime.
+		const revisionLastModified =
+			cachedFiles.length > 0
+				? Math.max(...[...cachedFiles].map((file) => file.blob.lastModifiedAt.getTime()))
+				: revisionStat.mtimeMs;
+
+		cachedRevisions.push({
+			commitOid: dir,
+			files: cachedFiles,
+			refs: refsByHash.get(dir) || [],
+			size: [...cachedFiles].reduce((sum, file) => sum + file.blob.size, 0),
+			path: revisionPath,
+			lastModifiedAt: new Date(revisionLastModified),
+		});
+
+		refsByHash.delete(dir);
+	}
+
+	// Verify that all refs refer to a valid revision
+	if (refsByHash.size > 0) {
+		throw new Error(
+			`Reference(s) refer to missing commit hashes: ${JSON.stringify(Object.fromEntries(refsByHash))} (${repoPath})`
+		);
+	}
+
+	const repoStats = await stat(repoPath);
+	const repoLastAccessed =
+		blobStats.size > 0 ? Math.max(...[...blobStats.values()].map((stat) => stat.atimeMs)) : repoStats.atimeMs;
+
+	const repoLastModified =
+		blobStats.size > 0 ? Math.max(...[...blobStats.values()].map((stat) => stat.mtimeMs)) : repoStats.mtimeMs;
+
+	// Return the constructed CachedRepoInfo object
+	return {
+		id: {
+			name: repoId,
+			type: repoType,
+		},
+		path: repoPath,
+		filesCount: blobStats.size,
+		revisions: cachedRevisions,
+		size: [...blobStats.values()].reduce((sum, stat) => sum + stat.size, 0),
+		lastAccessedAt: new Date(repoLastAccessed),
+		lastModifiedAt: new Date(repoLastModified),
+	};
+}
+
+/**
+ * Read every file in `refs/` and map commit hash -> ref names pointing to it.
+ */
+export async function scanRefsDir(refsPath: string, refsByHash: Map<string, string[]>): Promise<void> {
+	const refFiles = await readdir(refsPath, { withFileTypes: true });
+	for (const refFile of refFiles) {
+		const refFilePath = join(refsPath, refFile.name);
+		if (refFile.isDirectory()) continue; // Skip directories
+
+		const commitHash = await readFile(refFilePath, "utf-8");
+		const refName = refFile.name;
+		if (!refsByHash.has(commitHash)) {
+			refsByHash.set(commitHash, []);
+		}
+		refsByHash.get(commitHash)?.push(refName);
+	}
+}
+
+/**
+ * Collect info for every file in a snapshot dir, resolving symlinks to their
+ * underlying blobs and memoizing blob stats in `blobStats`.
+ */
+export async function scanSnapshotDir(
+	revisionPath: string,
+	cachedFiles: CachedFileInfo[],
+	blobStats: Map<string, Stats>
+): Promise<void> {
+	const files = await readdir(revisionPath, { withFileTypes: true });
+	for (const file of files) {
+		if (file.isDirectory()) continue; // Skip directories
+
+		const filePath = join(revisionPath, file.name);
+		const blobPath = await realpath(filePath);
+		const blobStat = await getBlobStat(blobPath, blobStats);
+
+		cachedFiles.push({
+			path: filePath,
+			blob: {
+				path: blobPath,
+				size: blobStat.size,
+				lastAccessedAt: new Date(blobStat.atimeMs),
+				lastModifiedAt: new Date(blobStat.mtimeMs),
+			},
+		});
+	}
+}
+
+/**
+ * Stat a blob, caching the result in `blobStats` so each blob (possibly
+ * shared by several revisions) is only stat'ed once.
+ */
+export async function getBlobStat(blobPath: string, blobStats: Map<string, Stats>): Promise<Stats> {
+	const blob = blobStats.get(blobPath);
+	if (!blob) {
+		const statResult = await lstat(blobPath);
+		blobStats.set(blobPath, statResult);
+		return statResult;
+	}
+	return blob;
+}
+
+/**
+ * Map a cache folder prefix ("models" | "datasets" | "spaces") to a RepoType.
+ *
+ * @throws TypeError for any other prefix.
+ */
+export function parseRepoType(type: string): RepoType {
+	switch (type) {
+		case "models":
+			return "model";
+		case "datasets":
+			return "dataset";
+		case "spaces":
+			return "space";
+		default:
+			throw new TypeError(`Invalid repo type: ${type}`);
+	}
+}
diff --git a/packages/hub/src/lib/index.ts b/packages/hub/src/lib/index.ts
index 554977f024..4630d1764f 100644
--- a/packages/hub/src/lib/index.ts
+++ b/packages/hub/src/lib/index.ts
@@ -1,3 +1,4 @@
+export * from "./cache-management";
 export * from "./commit";
 export * from "./count-commits";
 export * from "./create-repo";
diff --git a/packages/hub/vitest-browser.config.mts b/packages/hub/vitest-browser.config.mts
index 65be77c7ac..e106a2fbaa 100644
--- a/packages/hub/vitest-browser.config.mts
+++ b/packages/hub/vitest-browser.config.mts
@@ -2,6 +2,6 @@ import { configDefaults, defineConfig } from "vitest/config";
 
 export default defineConfig({
 	test: {
-		exclude: [...configDefaults.exclude, "src/utils/FileBlob.spec.ts"],
+		exclude: [...configDefaults.exclude, "src/utils/FileBlob.spec.ts", "src/lib/cache-management.spec.ts"],
 	},
 });