
Commit 86030e4 (1 parent: d57bf22)

[ggma] Add documentation for TinyLlama example

- Created `runtime/ggma/examples/generate_text/tinyllama.md` with step-by-step guide.
- Includes prerequisites, model generation commands, full processing pipeline, and a summary.

ONE-DCO-1.0-Signed-off-by: Sanggyu Lee <sg5.lee@samsung.com>

File tree: 4 files changed (+233, −0 lines)
decode.py (71 additions, 0 deletions)

```python
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=30,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

# past_key_values
# ---------------
# During prefill, "past_key_values" is not None but an empty Cache instance.
# Passing None instead makes torch.export happy.

input_to_remove = [
    "attention_mask",
    # For left pad,  [0, ⋯, 0, 1, ⋯, 1]
    # For right pad, [1, ⋯, 1, 0, ⋯, 0]
    # (0 marks a pad token)
    # This script uses right padding and passes an all-1 attention mask
    # (including pad positions); the NPU computes all positions, pad or not.
]
# Capture only decode steps, i.e. calls where the KV cache is already non-empty.
condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0

with torch.no_grad(), RecordingInput(
    model, condition_fn, input_to_remove=input_to_remove
) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
captured_input = rec.captured_input

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Tico
import tico
from tico.serialize.operators.adapters.onert.llama_attention import (
    llama_attention_forward_adapter,
)
from transformers.models.llama.modeling_llama import LlamaAttention

# LlamaAttention.forward = llama_attention_forward_adapter

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
circle_model = tico.convert(model, captured_input)
circle_model.save("tinyllama.decode.circle")
```
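A note on `condition_fn` above: it tells `RecordingInput` to skip the prefill call (empty KV cache) and capture only decode steps. The intent can be sketched with a stand-in cache object; `FakeCache` here is hypothetical and only mimics the `get_seq_length` method of the real transformers `Cache`:

```python
class FakeCache:
    """Hypothetical stand-in for the transformers Cache API; only the one
    method condition_fn needs is implemented."""

    def __init__(self, seq_length):
        self._seq_length = seq_length

    def get_seq_length(self):
        return self._seq_length


# Same predicate as in the script: capture only when the cache is non-empty.
condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0

# Prefill call: the cache is still empty, so the call is not captured.
print(condition_fn({"past_key_values": FakeCache(0)}))   # False

# Decode call: the cache already holds the prompt, so the call is captured.
print(condition_fn({"past_key_values": FakeCache(30)}))  # True
```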
prefill.py (76 additions, 0 deletions)

```python
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=32,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

# past_key_values
# ---------------
# During prefill, "past_key_values" is not None but an empty Cache instance.
# Passing None instead makes torch.export happy.

input_to_remove = [
    "past_key_values",
    # DynamicCache has been a flattenable (pytree-registered) type since
    # transformers 4.50; see _pytree.py > tree_flatten, where SUPPORTED_NODES
    # includes transformers.DynamicCache.
    # After flattening, DynamicCache becomes {"key_cache": [], "value_cache": []}:
    # the dict values are returned as leaves and the dict keys are stored in
    # the treespec.
    #
    # On prefill, DynamicCache is empty, so the dict is empty after flattening,
    # and PyTorch drops the empty dict. If the number of args is 4 (including
    # the cache), it becomes 3! To avoid this error, don't pass an empty
    # cache; just pass None.
    "attention_mask",
    # For left pad,  [0, ⋯, 0, 1, ⋯, 1]
    # For right pad, [1, ⋯, 1, 0, ⋯, 0]
    # (0 marks a pad token)
    # This script uses right padding and passes an all-1 attention mask
    # (including pad positions); the NPU computes all positions, pad or not.
    "cache_position",
    # The list of cache positions, e.g. [0, 1, ..., 11].
    # For the NPU, we always store all values (including pad).
]

with torch.no_grad(), RecordingInput(model, input_to_remove=input_to_remove) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
captured_input = rec.captured_input

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Tico
import tico

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
circle_model = tico.convert(model, captured_input)
circle_model.save("tinyllama.prefill.circle")
```
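The empty-cache pitfall described in the `past_key_values` comment can be illustrated with a simplified stand-in for pytree flattening (plain Python; the real implementation lives in `torch.utils._pytree` and handles many more container types):

```python
# Simplified stand-in for pytree flattening: containers dissolve into the
# treespec and only leaf values survive as flattened arguments.
def tree_flatten(obj):
    if isinstance(obj, dict):
        return [leaf for v in obj.values() for leaf in tree_flatten(v)]
    if isinstance(obj, list):
        return [leaf for v in obj for leaf in tree_flatten(v)]
    return [obj]

# An empty DynamicCache flattens to {"key_cache": [], "value_cache": []},
# which yields zero leaves: the whole argument vanishes from the flat list.
print(tree_flatten({"key_cache": [], "value_cache": []}))          # []

# A filled cache keeps its tensors as leaves, so the argument count is stable.
print(tree_flatten({"key_cache": ["k0"], "value_cache": ["v0"]}))  # ['k0', 'v0']
```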
requirements.txt (2 additions, 0 deletions)

```
transformers==4.50.3
torch
```
runtime/ggma/examples/generate_text/tinyllama.md (84 additions, 0 deletions)
# TinyLlama Example Documentation

This document provides a step-by-step guide for generating and processing a text generation model.

## Summary

1. Set up the environment and install dependencies.
2. Generate the initial `prefill` and `decode` Circle model files.
3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference.

## Prerequisites

1. **Python virtual environment**
   ```bash
   cd runtime/ggma/examples/generate_text/
   python3 -m venv _
   source _/bin/activate
   ```

2. **Install required Python packages**
   ```bash
   pip install -r requirements.txt
   ```

3. **Install TICO (Torch IR to Circle ONE)**
   ```bash
   # Clone the repository
   git clone https://github.com/Samsung/TICO.git
   # Install it in editable mode
   pip install -e TICO
   ```

## Generating Model Files

Run the provided scripts to create the prefill and decode Circle model files:

```bash
python prefill.py  # Generates tinyllama.prefill.circle
python decode.py   # Generates tinyllama.decode.circle
```

You can verify the generated files:

```bash
ls -lh *.circle
# Expected output:
# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.decode.circle
# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.prefill.circle
```

## Full Processing Pipeline

The following pipeline shows how to chain several tools to transform the model:

```bash
with.py tinyllama.decode.circle | \
fuse.attention.py | \
fuse.bmm_lhs_const.py | \
reshape.fc_weight.py | \
reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] | \
transpose.io.kvcache.py | \
remove.io.py output --keep_by_id 0 | \
select.op.py --by_id 0-181 | \
gc.py | \
retype.input_ids.py > decode.circle
```

### Explanation of each step

| Tool | Purpose |
|------|---------|
| `with.py` | Reads the Circle model file and writes it to stdout, starting the pipeline. |
| `fuse.attention.py` | Fuses attention-related operators for optimization. |
| `fuse.bmm_lhs_const.py` | Fuses constant left-hand-side matrices in batch matrix multiplication. |
| `reshape.fc_weight.py` | Reshapes fully-connected layer weights. |
| `reshape.io.py input --by_shape [...]` | Reshapes input tensors to the specified shapes. |
| `transpose.io.kvcache.py` | Transposes the KV-cache tensors. |
| `remove.io.py output --keep_by_id 0` | Keeps only the output tensor with ID 0, removing the rest. |
| `select.op.py --by_id 0-181` | Selects operators with IDs from 0 to 181. |
| `gc.py` | Performs garbage collection, removing unused tensors and operators. |
| `retype.input_ids.py` | Changes the data type of the input IDs as needed. |
| `> decode.circle` | Saves the final processed model to `decode.circle`. |

Feel free to adjust the pipeline arguments (e.g., shapes, IDs) to suit your specific model configuration.
