Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,27 @@
# DocBotTest
# DocBotTest

## Hugging Face demo

`hf_demo.py` is a small script that:
- downloads a model from the Hugging Face Hub (via `transformers`)
- downloads a dataset from the Hugging Face Hub (via `datasets`)
- runs batched inference over a text column and prints predictions

### Setup

```bash
pip install -U "transformers" "datasets" "torch" "accelerate"
```

### Run

```bash
python hf_demo.py
```

### Customize

```bash
python hf_demo.py --dataset imdb --split "test[:50]" --text-column text \
--task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
```
112 changes: 112 additions & 0 deletions hf_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
hf_demo.py

Demo code that:
- pulls a model from the Hugging Face Hub (Transformers)
- pulls a dataset from the Hugging Face Hub (Datasets)
- runs batched inference over a text column

Example:
python hf_demo.py

Requires:
pip install -U "transformers" "datasets" "torch" "accelerate"
"""

from __future__ import annotations

import argparse
from typing import Any, Dict, Iterable, List, Optional

from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset


def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Run a HF model on a HF dataset.")
p.add_argument(
"--dataset",
default="imdb",
help='Hugging Face dataset id, e.g. "imdb" or "cornell-movie-review-data/rotten_tomatoes".',
)
p.add_argument(
"--split",
default="test[:200]",
help='Dataset split (supports slicing), e.g. "train", "test", "train[:1000]".',
)
p.add_argument(
"--text-column",
default="text",
help='Column to feed into the model, e.g. "text".',
)
p.add_argument(
"--task",
default="text-classification",
help='Pipeline task, e.g. "text-classification", "sentiment-analysis", "summarization".',
)
p.add_argument(
"--model",
default="distilbert-base-uncased-finetuned-sst-2-english",
help='Model id on Hugging Face Hub, e.g. "distilbert-base-uncased-finetuned-sst-2-english".',
)
p.add_argument(
"--batch-size",
type=int,
default=16,
help="Pipeline batch size.",
)
p.add_argument(
"--max-examples",
type=int,
default=25,
help="Stop after this many examples (keeps the demo quick).",
)
return p.parse_args()


def _snippet(s: str, n: int = 200) -> str:
s = s.replace("\n", " ").strip()
return s if len(s) <= n else s[:n] + "..."


def main() -> None:
    """Entry point: load a Hub dataset and model, run batched inference, print a sample.

    Steps: parse CLI args, fetch the dataset split, validate the text column,
    build a Transformers pipeline, stream predictions (stopping after
    ``--max-examples``), then print a short report with up to 5 examples.
    """
    args = _parse_args()

    # 1) Load a dataset from the Hugging Face Hub.
    dataset = load_dataset(args.dataset, split=args.split)
    if args.text_column not in dataset.column_names:
        raise ValueError(
            f'Column "{args.text_column}" not found. Available columns: {dataset.column_names}'
        )

    # 2) Load a model from the Hugging Face Hub via a Transformers pipeline.
    # device_map="auto" will use GPU if available; otherwise CPU.
    pipe = pipeline(task=args.task, model=args.model, device_map="auto")

    # 3) Run inference over the dataset (batched). KeyDataset streams one
    # column into the pipeline so batching happens inside the pipeline.
    predictions: List[Dict[str, Any]] = []
    stream = pipe(
        KeyDataset(dataset, args.text_column),
        batch_size=args.batch_size,
        truncation=True,
    )
    for result in stream:
        predictions.append(result)
        if len(predictions) >= args.max_examples:
            break

    # 4) Print a few example outputs.
    print(f"dataset={args.dataset} split={args.split} text_column={args.text_column}")
    print(f"task={args.task} model={args.model}")
    print(f"processed_examples={len(predictions)}")
    print()

    # Predictions are in dataset order, so index i pairs output with input.
    for idx in range(min(5, len(predictions))):
        sample_text = dataset[idx][args.text_column]
        print("----")
        print("text:", _snippet(sample_text))
        print("pred:", predictions[idx])


if __name__ == "__main__":
main()