Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,27 @@
# DocBotTest
# DocBotTest

## Hugging Face demo

`hf_demo.py` is a small script that:
- downloads a model from the Hugging Face Hub (via `transformers`)
- downloads a dataset from the Hugging Face Hub (via `datasets`)
- runs batched inference over a text column and prints predictions

### Setup

```bash
pip install -U "transformers" "datasets" "torch" "accelerate"
```

### Run

```bash
python hf_demo.py
```

### Customize

```bash
python hf_demo.py --dataset imdb --split "test[:50]" --text-column text \
--task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
```
112 changes: 112 additions & 0 deletions hf_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
hf_demo.py

Demo code that:
- pulls a model from the Hugging Face Hub (Transformers)
- pulls a dataset from the Hugging Face Hub (Datasets)
- runs batched inference over a text column

Example:
python hf_demo.py

Requires:
pip install -U "transformers" "datasets" "torch" "accelerate"
"""

from __future__ import annotations

import argparse
from typing import Any, Dict, Iterable, List, Optional

from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset


def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Run a HF model on a HF dataset.")
p.add_argument(
"--dataset",
default="imdb",
help='Hugging Face dataset id, e.g. "imdb" or "cornell-movie-review-data/rotten_tomatoes".',
)
p.add_argument(
"--split",
default="test[:200]",
help='Dataset split (supports slicing), e.g. "train", "test", "train[:1000]".',
)
p.add_argument(
"--text-column",
default="text",
help='Column to feed into the model, e.g. "text".',
)
p.add_argument(
"--task",
default="text-classification",
help='Pipeline task, e.g. "text-classification", "sentiment-analysis", "summarization".',
)
p.add_argument(
"--model",
default="distilbert-base-uncased-finetuned-sst-2-english",
help='Model id on Hugging Face Hub, e.g. "distilbert-base-uncased-finetuned-sst-2-english".',
)
p.add_argument(
"--batch-size",
type=int,
default=16,
help="Pipeline batch size.",
)
p.add_argument(
"--max-examples",
type=int,
default=25,
help="Stop after this many examples (keeps the demo quick).",
)
return p.parse_args()


def _snippet(s: str, n: int = 200) -> str:
s = s.replace("\n", " ").strip()
return s if len(s) <= n else s[:n] + "..."


def main() -> None:
    """Entry point: load a Hub dataset and model, run batched inference, print a sample.

    Steps: parse CLI args, fetch the dataset split, validate the text column,
    build a Transformers pipeline, stream predictions (stopping after
    ``--max-examples``), then print a short report with up to 5 examples.
    """
    args = _parse_args()

    # 1) Load a dataset from the Hugging Face Hub.
    dataset = load_dataset(args.dataset, split=args.split)
    if args.text_column not in dataset.column_names:
        raise ValueError(
            f'Column "{args.text_column}" not found. Available columns: {dataset.column_names}'
        )

    # 2) Load a model from the Hugging Face Hub via a Transformers pipeline.
    # device_map="auto" will use GPU if available; otherwise CPU.
    pipe = pipeline(task=args.task, model=args.model, device_map="auto")

    # 3) Run inference over the dataset (batched). KeyDataset streams one
    # column into the pipeline so batching happens inside the pipeline.
    predictions: List[Dict[str, Any]] = []
    stream = pipe(
        KeyDataset(dataset, args.text_column),
        batch_size=args.batch_size,
        truncation=True,
    )
    for result in stream:
        predictions.append(result)
        if len(predictions) >= args.max_examples:
            break

    # 4) Print a few example outputs.
    print(f"dataset={args.dataset} split={args.split} text_column={args.text_column}")
    print(f"task={args.task} model={args.model}")
    print(f"processed_examples={len(predictions)}")
    print()

    # Predictions are in dataset order, so index i pairs output with input.
    for idx in range(min(5, len(predictions))):
        sample_text = dataset[idx][args.text_column]
        print("----")
        print("text:", _snippet(sample_text))
        print("pred:", predictions[idx])


if __name__ == "__main__":
main()