Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a383613
Add GGUF file support in CLI and printing functions
vm7608 Jan 7, 2026
86555c8
Refactor print functions to accept customizable name length
vm7608 Jan 8, 2026
4692471
Add gguf dtype mapping to bytes-per-weight
diegovelilla Jan 30, 2026
687b6d0
Move gguf functionality to separate file
diegovelilla Jan 31, 2026
0dd089a
Added GGUFDtypes Literal, specific metadata dataclasses and typeguard
diegovelilla Jan 31, 2026
fc301a9
Added fetching and parsing of gguf metadata + weights size estimation.
diegovelilla Feb 1, 2026
8987533
Added function to compute kv cache size.
diegovelilla Feb 2, 2026
4c7f9f4
Added printing for gguf files (without kv-cache) + fixed bugs for sha…
diegovelilla Feb 2, 2026
2424ac0
Added printing for kv-cache estimation.
diegovelilla Feb 3, 2026
4320c5b
Fix --experimental bug.
diegovelilla Feb 3, 2026
74643cd
--experimental bug fix + removed ?recursive=True comments.
diegovelilla Feb 3, 2026
6b93f28
Fixed --kv-cache-dtype bug and incompatibility with gguf files.
diegovelilla Feb 3, 2026
6919426
Added new formatting of the tables and fixed sharded files bug.
diegovelilla Feb 4, 2026
a076ab5
Fixed kv-cache printing bug.
diegovelilla Feb 4, 2026
3728de5
Added asynchronous fetching with semaphore.
diegovelilla Feb 5, 2026
147a9b8
Added GGUF section to README.
diegovelilla Feb 5, 2026
57e9f05
Fixed f string quoting.
diegovelilla Feb 20, 2026
fced11d
Removed --gguf flag
diegovelilla Feb 20, 2026
9008b4c
Added found GGUF filepaths to the warning.
diegovelilla Feb 20, 2026
d36c019
Updated README with gguf
diegovelilla Feb 23, 2026
be147d2
Pre-commit fixes
diegovelilla Feb 23, 2026
8bbec01
Added changes for __version__ commit
diegovelilla Feb 23, 2026
9fa7a43
Added __version__ to json output for gguf
diegovelilla Feb 23, 2026
8bb8c34
Update README with .gguf screenshot and moved over Agent skills
diegovelilla Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ uvx hf-mem --model-id MiniMaxAI/MiniMax-M2 --experimental

<img src="https://github.com/user-attachments/assets/64eaff88-d395-4d8d-849b-78fb86411dc3" />

## GGUF Files

*.gguf* files will only be listed when no *.safetensors* files are present in the repository, or when using the `--gguf-file` flag followed by the filepath of a GGUF model. For sharded models, the path to any individual shard will work. Other flags, such as `--experimental` or those related to KV cache calculations, are also compatible.

```bash
uvx hf-mem --model-id TheBloke/deepseek-llm-7B-chat-GGUF --gguf-file deepseek-llm-7b-chat.Q2_K.gguf --experimental
```

<img width="1140" height="1065" alt="image" src="https://github.com/user-attachments/assets/9cdcb769-6ca9-4ed9-adaf-630848c94356" />

## (Optional) Agent Skills

Optionally, you can add `hf-mem` as an agent skill, which allows the underlying coding agent to discover and use it when provided as a [`SKILL.md`](.skills/hf-mem/SKILL.md).
Expand Down
173 changes: 165 additions & 8 deletions src/hf_mem/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import asyncio
import json
import os
import re
import struct
import warnings
from dataclasses import asdict
Expand All @@ -12,15 +13,17 @@
import httpx

from hf_mem import __version__
from hf_mem.gguf import GGUFDtype, GGUFMetadata, fetch_gguf_with_semaphore, gguf_metadata_to_json, merge_shards
from hf_mem.metadata import parse_safetensors_metadata
from hf_mem.print import print_report
from hf_mem.print import print_report, print_report_for_gguf
from hf_mem.types import TorchDtypes, get_safetensors_dtype_bytes, torch_dtype_to_safetensors_dtype

# NOTE: Defines the bytes that will be fetched per safetensors file, but the metadata
# can indeed be larger than that
MAX_METADATA_SIZE = 100_000
REQUEST_TIMEOUT = float(os.getenv("REQUEST_TIMEOUT", 30.0))
MAX_CONCURRENCY = int(os.getenv("MAX_WORKERS", min(32, (os.cpu_count() or 1) + 4)))
KV_CACHE_DTYPE_CHOICES = ["auto", "bfloat16", "fp8", "fp8_ds_mla", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]


# NOTE: Return type-hint set to `Any`, but it will only be a JSON-compatible object
Expand Down Expand Up @@ -91,6 +94,7 @@ async def run(
# END_KV_CACHE_ARGS
json_output: bool = False,
ignore_table_width: bool = False,
gguf_file: str | None = None,
) -> Dict[str, Any] | None:
headers = {"User-Agent": f"hf-mem/0.4; id={uuid4()}; model_id={model_id}; revision={revision}"}
# NOTE: Read from `HF_TOKEN` if provided, then fallback to reading from `$HF_HOME/token`
Expand Down Expand Up @@ -119,13 +123,155 @@ async def run(
follow_redirects=True,
)

# TODO: `recursive=true` shouldn't really be required unless it's a Diffusers
# models... I don't think this adds extra latency anyway
url = f"https://huggingface.co/api/models/{model_id}/tree/{revision}?recursive=true"
files = await get_json_file(client=client, url=url, headers=headers)
file_paths = [f["path"] for f in files if f.get("path") and f.get("type") == "file"]

if "model.safetensors" in file_paths:
# NOTE: GGUF support only applies if:
# 1. The `--gguf-file` flag is set.
# 2. No Safetensors files are found and at least one gguf file is found
gguf_paths = [f for f in file_paths if str(f).endswith(".gguf")]
has_safetensors = any(
f in ["model.safetensors", "model.safetensors.index.json", "model_index.json"] for f in file_paths
)
gguf = gguf_file is not None or (gguf_paths and not has_safetensors)

if not gguf and (has_safetensors and gguf_paths):
warnings.warn(
f"Both Safetensors and GGUF files have been found for {model_id} @ {revision}, if you want to estimate any of the GGUF file sizes, please use the `--gguf-file` flag with the path to the specific GGUF file. GGUF files found: {gguf_paths}."
)

if gguf:
if kv_cache_dtype not in GGUFDtype.__members__ and kv_cache_dtype != "auto":
raise RuntimeError(
f"--kv-cache-dtype={kv_cache_dtype} not recognized for GGUF files. Valid options: {list(GGUFDtype.__members__.keys())} or `auto`."
)

if not gguf_paths:
raise RuntimeError(f"No GGUF files found for {model_id} @ {revision}.")

if gguf_file:
# Check if it's a sharded file (model-00001-of-00046.gguf)
if prefix_match := re.match(r"(.+)-\d+-of-\d+\.gguf$", gguf_file):
# Keep all shards with the same prefix
prefix = prefix_match.group(1)
gguf_paths = [
path
for path in gguf_paths
if re.match(rf"{re.escape(prefix)}-\d+-of-\d+\.gguf$", str(path))
]
else:
# Not sharded
gguf_paths = [path for path in gguf_paths if str(path).endswith(gguf_file)]
if len(gguf_paths) > 1:
raise RuntimeError(
f"Multiple GGUF files named `{gguf_file}` found for {model_id} @ {revision}."
)

if not gguf_paths:
raise RuntimeError(f"No GGUF file matching `{gguf_file}` found for {model_id} @ {revision}.")

semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

tasks = []
for path in gguf_paths:
# In sharded GGUF files tensor metadata also gets sharded, so we need to merge them all
shard_pattern = re.match(
r"(.+)-(\d+)-of-(\d+)\.gguf$", str(path)
) # Ex: Kimi-K2.5-BF16-00001-of-00046.gguf
parse_kv_cache = experimental
# For sharded files, parsing kv_cache data might result in runtime errors (missing fields)
if experimental and shard_pattern:
shard_num = int(shard_pattern.group(2)) # Get first number
parse_kv_cache = shard_num == 1

task = asyncio.create_task(
fetch_gguf_with_semaphore(
semaphore=semaphore,
client=client,
model_id=model_id,
revision=revision,
path=path,
parse_kv_cache=parse_kv_cache,
shard_pattern=shard_pattern,
max_model_len=max_model_len,
kv_cache_dtype=kv_cache_dtype,
batch_size=batch_size,
headers=headers,
)
)
tasks.append(task)

results = await asyncio.gather(*tasks, return_exceptions=False)

gguf_files: Dict[str, GGUFMetadata] = dict()
for path, metadata, shard_pattern in results:
# Merge metadata for sharded files
if shard_pattern:
# Ex: base_name = Kimi-K2.5-BF16
base_name = shard_pattern.group(1) + ".gguf"
if base_name in gguf_files:
gguf_files[base_name] = merge_shards(gguf_files[base_name], metadata)
else:
gguf_files[base_name] = metadata
else:
gguf_files[path] = metadata

if json_output:
if gguf_file:
print(
json.dumps(
[
gguf_metadata_to_json(model_id=filename, revision=revision, metadata=gguf_metadata)
for filename, gguf_metadata in gguf_files.items()
][0]
)
)
else:
print(
json.dumps(
[
gguf_metadata_to_json(model_id=filename, revision=revision, metadata=gguf_metadata)
for filename, gguf_metadata in gguf_files.items()
]
)
)
else:
if gguf_file:
gguf_metadata = list(gguf_files.values())[0]
gguf_file_name = list(gguf_files.keys())[0]

# If we print just one file, we reuse the print_report function
if experimental and gguf_metadata.kv_cache_info is not None:
print_report(
model_id=gguf_file_name,
revision=revision,
metadata=gguf_metadata,
cache={
"max_model_len": gguf_metadata.kv_cache_info.max_model_len,
"cache_size": gguf_metadata.kv_cache_info.cache_size,
"batch_size": gguf_metadata.kv_cache_info.batch_size,
"cache_dtype": gguf_metadata.kv_cache_info.cache_dtype, # type: ignore
},
ignore_table_width=ignore_table_width,
)
else:
print_report(
model_id=gguf_file_name,
revision=revision,
metadata=gguf_metadata,
ignore_table_width=ignore_table_width,
)
else:
# For multiple files, we use the new one
print_report_for_gguf(
model_id=model_id,
revision=revision,
gguf_files=gguf_files,
ignore_table_width=ignore_table_width,
)
return
elif "model.safetensors" in file_paths:
url = f"https://huggingface.co/{model_id}/resolve/{revision}/model.safetensors"
raw_metadata = await fetch_safetensors_metadata(client=client, url=url, headers=headers)

Expand Down Expand Up @@ -237,7 +383,7 @@ async def fetch_with_semaphore(url: str) -> Dict[str, Any]:
metadata = parse_safetensors_metadata(raw_metadata=raw_metadata)
else:
raise RuntimeError(
"NONE OF `model.safetensors`, `model.safetensors.index.json`, `model_index.json` HAS BEEN FOUND"
"NONE OF `model.safetensors`, `model.safetensors.index.json`, `model_index.json` FILES HAVE BEEN FOUND"
)

cache_size = None
Expand Down Expand Up @@ -432,8 +578,7 @@ def main() -> None:
type=str,
default="auto",
# NOTE: https://docs.vllm.ai/en/stable/cli/serve/#-kv-cache-dtype
choices={"auto", "bfloat16", "fp8", "fp8_ds_mla", "fp8_e4m3", "fp8_e5m2", "fp8_inc"},
help="Data type for the KV cache storage. If `auto` is specified, it will use the default model dtype specified in the `config.json` (if available). Despite the FP8 data types having different formats, all those take 1 byte, meaning that the calculation would lead to the same results. Defaults to `auto`.",
help=f"Data type for the KV cache storage. If `auto` is specified, it will use the default model dtype specified in the `config.json` (if available) or F16 for GGUF files. Despite the FP8 data types having different formats, all those take 1 byte, meaning that the calculation would lead to the same results. Valid values are {KV_CACHE_DTYPE_CHOICES} without for safetensors files and {['auto'] + list(GGUFDtype.__members__.keys())} for GGUF files. Defaults to `auto`.",
)

parser.add_argument(
Expand All @@ -446,14 +591,24 @@ def main() -> None:
action="store_true",
help="Whether to ignore the maximum recommended table width, in case the `--model-id` and/or `--revision` cause a row overflow when printing those.",
)

parser.add_argument(
"--gguf-file",
type=str,
default=None,
help="Specific GGUF file to estimate. If not provided, all GGUF files found in the repo will be estimated. Only the file name is required, not the full path.",
)
args = parser.parse_args()

if args.experimental:
warnings.warn(
"`--experimental` is set, which means that models with an architecture as `...ForCausalLM` and `...ForConditionalGeneration` will include estimations for the KV Cache as well. You can also provide the args `--max-model-len` and `--batch-size` as part of the estimation. Note that enabling `--experimental` means that the output will be different both when displayed and when dumped as JSON with `--json-output`, so bear that in mind."
)

if args.kv_cache_dtype not in KV_CACHE_DTYPE_CHOICES:
raise RuntimeError(
f"--kv-cache-dtype={args.kv_cache_dtype} not recognized. Valid options: {KV_CACHE_DTYPE_CHOICES}."
)

asyncio.run(
run(
model_id=args.model_id,
Expand All @@ -466,5 +621,7 @@ def main() -> None:
# NOTE: Below are the arguments that affect the output format
json_output=args.json_output,
ignore_table_width=args.ignore_table_width,
# NOTE: GGUF flags
gguf_file=args.gguf_file,
)
)
Loading