
Commit 29a67d2

Merge branch 'main' into plr_fixes
2 parents: 3c4749b + 8ef5cd6

16 files changed: +471 additions, -67 deletions
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+name: CI collated reports
+
+on:
+  workflow_call:
+    inputs:
+      job:
+        required: true
+        type: string
+      report_repo_id:
+        required: true
+        type: string
+      machine_type:
+        required: true
+        type: string
+      gpu_name:
+        description: Name of the GPU used for the job. Its enough that the value contains the name of the GPU, e.g. "noise-h100-more-noise". Case insensitive.
+        required: true
+        type: string
+
+jobs:
+  collated_reports:
+    name: Collated reports
+    runs-on: ubuntu-22.04
+    if: always()
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/download-artifact@v4
+
+      - name: Collated reports
+        shell: bash
+        env:
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          CI_SHA: ${{ github.sha }}
+          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+        run: |
+          pip install huggingface_hub
+          python3 utils/collated_reports.py \
+            --path /transformers/reports/ \
+            --machine-type ${{ inputs.machine_type }} \
+            --commit-hash ${{ env.CI_SHA }} \
+            --job ${{ inputs.job }} \
+            --report-repo-id ${{ inputs.report_repo_id }} \
+            --gpu-name ${{ inputs.gpu_name }}
+
+      - name: Upload collated reports
+        uses: actions/upload-artifact@v4
+        with:
+          name: collated_reports_${{ env.CI_SHA }}.json
+          path: collated_reports_${{ env.CI_SHA }}.json
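The `run` step above installs `huggingface_hub` and invokes `utils/collated_reports.py` with the workflow inputs; that script is not part of this diff. A hedged sketch of the command-line surface the workflow assumes (an illustrative argparse skeleton, not the real `utils/collated_reports.py`):

```python
# Hypothetical sketch of the CLI that the workflow above invokes. Only the flag
# names come from the workflow's `run` step; the report layout and output format
# are assumptions for illustration.
import argparse
import json
from pathlib import Path


def main() -> None:
    parser = argparse.ArgumentParser(description="Collate per-test reports into one JSON file.")
    parser.add_argument("--path", required=True, help="Directory containing individual report files.")
    parser.add_argument("--machine-type", required=True)
    parser.add_argument("--commit-hash", required=True)
    parser.add_argument("--job", required=True)
    parser.add_argument("--report-repo-id", required=True)
    parser.add_argument("--gpu-name", required=True)
    args = parser.parse_args()

    # Gather whatever report files exist under --path (assumed layout).
    reports = [p.read_text() for p in Path(args.path).glob("**/*.txt")]
    collated = {
        "machine_type": args.machine_type,
        "commit_hash": args.commit_hash,
        "job": args.job,
        "gpu_name": args.gpu_name,
        "reports": reports,
    }

    # The workflow later uploads collated_reports_<sha>.json as an artifact.
    Path(f"collated_reports_{args.commit_hash}.json").write_text(json.dumps(collated, indent=2))


if __name__ == "__main__":
    main()
```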

SECURITY.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Models uploaded on the Hugging Face Hub come in different formats. We heavily re
 models in the [`safetensors`](https://github.com/huggingface/safetensors) format (which is the default prioritized
 by the transformers library), as developed specifically to prevent arbitrary code execution on your system.
 
-To avoid loading models from unsafe formats(e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.
+To avoid loading models from unsafe formats (e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.
 
 ### Remote code
 
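For reference, the `use_safetensors` argument described in the reworded sentence is passed to `from_pretrained`; a minimal usage sketch (the repository id below is a placeholder):

```python
from transformers import AutoModelForCausalLM

# With use_safetensors=True, loading raises an error if the checkpoint only
# ships pickle-based weights instead of silently falling back to them.
model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-model",  # placeholder repository id
    use_safetensors=True,
)
```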

docs/source/en/generation_strategies.md

Lines changed: 25 additions & 0 deletions
@@ -504,6 +504,31 @@ Recommended practices:
 - Add self-contained examples to enable quick experimentation.
 - Describe soft-requirements such as if the method only works well with a certain family of models.
 
+### Reusing `generate`’s input preparation
+
+If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]’s full preparation pipeline while overriding only the decoding loop.
+
+```py
+def custom_loop(model, input_ids, attention_mask, logits_processor, stopping_criteria, generation_config, **model_kwargs):
+    next_tokens = input_ids
+    while input_ids.shape[1] < stopping_criteria[0].max_length:
+        logits = model(next_tokens, attention_mask=attention_mask, **model_kwargs).logits
+        next_token_logits = logits_processor(input_ids, logits[:, -1, :])
+        next_tokens = torch.argmax(next_token_logits, dim=-1)[:, None]
+        input_ids = torch.cat((input_ids, next_tokens), dim=-1)
+        attention_mask = torch.cat((attention_mask, torch.ones_like(next_tokens)), dim=-1)
+    return input_ids
+
+output = model.generate(
+    **inputs,
+    custom_generate=custom_loop,
+    max_new_tokens=10,
+)
+```
+
+> [!TIP]
+> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers’ built-in input preparation logic.
+
 ### Finding custom generation methods
 
 You can find all custom generation methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`. In addition to the tag, we curate two collections of `custom_generate` methods:
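The new TIP says a published `custom_generate` repository can itself hand a callable back to `model.generate()`. A hedged sketch of that pattern, as an illustrative `custom_generate/generate.py` rather than an official one:

```python
# Illustrative sketch of a Hub repository's custom_generate/generate.py that
# follows the TIP above: define the decoding loop as a callable and re-enter
# model.generate() so Transformers still prepares attention masks, logits
# processors and stopping criteria. Not an official implementation.
import torch


def _greedy_loop(model, input_ids, attention_mask, logits_processor, stopping_criteria, generation_config, **model_kwargs):
    # **model_kwargs also absorbs extras such as synced_gpus/streamer, which this
    # simple loop ignores; it recomputes the full forward pass at every step.
    while input_ids.shape[1] < stopping_criteria[0].max_length:
        logits = model(input_ids, attention_mask=attention_mask).logits
        next_token_logits = logits_processor(input_ids, logits[:, -1, :])
        next_tokens = torch.argmax(next_token_logits, dim=-1)[:, None]
        input_ids = torch.cat((input_ids, next_tokens), dim=-1)
        attention_mask = torch.cat((attention_mask, torch.ones_like(next_tokens)), dim=-1)
    return input_ids


def generate(model, *args, **kwargs):
    # Entry point looked up when users pass custom_generate="<repo>": delegate
    # back to the standard generate() with the callable form of custom_generate.
    return model.generate(*args, custom_generate=_greedy_loop, **kwargs)
```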

docs/source/ko/_toctree.yml

Lines changed: 4 additions & 0 deletions
@@ -91,6 +91,10 @@
       - local: in_translation
         title: (번역중) Tools and RAG
     title: 모델을 사용해 대화하기
+- sections:
+  - local: tiny_agents
+    title: Tiny-Agents CLI 및 MCP 도구
+  title: 서빙(Serving)
 - sections:
   - local: in_translation
     title: (번역중) torch.compile

docs/source/ko/tiny_agents.md

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+### `tiny-agents` CLI 및 MCP 도구[[tiny-agents-cli-and-mcp-tools]]
+
+MCP 도구의 사용을 보여주기 위해 [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI와 `transformers serve` 서버를 연동하는 방법을 살펴보겠습니다.
+
+> [!TIP]
+> 이 예시처럼 많은 Hugging Face Spaces를 MCP 서버로 활용할 수 있습니다. 호환 가능한 모든 Spaces는 [여기](https://huggingface.co/spaces?filter=mcp-server)에서 찾을 수 있습니다.
+
+MCP 도구를 사용하려면 먼저 모델에 사용 가능한 도구를 알려야 합니다. 예를 들어, [이미지 생성 MCP 서버](https://evalstate-flux1-schnell.hf.space/)를 참조하는 `tiny-agents` 설정 파일을 살펴보겠습니다.
+
+```json
+{
+  "model": "Menlo/Jan-nano",
+  "endpointUrl": "http://localhost:8000",
+  "servers": [
+    {
+      "type": "sse",
+      "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
+    }
+  ]
+}
+```
+
+그런 다음 아래 명령어로 `tiny-agents` 채팅 인터페이스를 실행할 수 있습니다.
+
+```bash
+tiny-agents run path/to/your/config.json
+```
+
+백그라운드에서 `transformers serve`가 실행 중이라면, 이제 로컬 모델에서 MCP 도구를 사용할 수 있습니다. 다음은 `tiny-agents`와의 채팅 세션 예시입니다.
+
+```bash
+Agent loaded with 1 tools:
+• flux1_schnell_infer
+» Generate an image of a cat on the moon
+<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
+
+Tool req_0_tool_call
+[Binary Content: Image image/webp, 57732 bytes]
+The task is complete and the content accessible to the User
+Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
+380576952
+
+Flux 1 Schnell 이미지 생성기를 사용하여 달 위의 고양이 이미지를 생성했습니다. 이미지는 1024x1024 픽셀이며 4번의 추론 단계를 거쳐 생성되었습니다. 변경 사항이 필요하거나 추가 도움이 필요하시면 알려주세요!
+```
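The configuration above points `tiny-agents` at a locally running `transformers serve` endpoint on `http://localhost:8000`. As a hedged illustration, such an endpoint can also be queried directly with an OpenAI-compatible client; the `/v1` base path, the dummy API key, and the served model name are assumptions for this sketch, not details stated by the diff:

```python
# Minimal sketch: query a locally running `transformers serve` endpoint with an
# OpenAI-compatible client. The /v1 base path, the placeholder API key, and the
# model name are assumptions for illustration; adjust them to your local setup.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="Menlo/Jan-nano",  # assumed to be served locally, as in the config above
    messages=[{"role": "user", "content": "Generate an image of a cat on the moon"}],
)
print(response.choices[0].message.content)
```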

src/transformers/cache_utils.py

Lines changed: 7 additions & 6 deletions
@@ -16,9 +16,6 @@
 )
 
 
-if _is_quanto_greater_than_0_2_5 := is_quanto_greater("0.2.5", accept_dev=True):
-    from optimum.quanto import MaxOptimizer, qint2, qint4, quantize_weight
-
 if is_hqq_available():
     from hqq.core.quantize import Quantizer as HQQQuantizer
 
@@ -558,7 +555,7 @@ def __init__(
         q_group_size: int = 64,
         residual_length: int = 128,
     ):
-        super().__init__(self)
+        super().__init__()
         self.nbits = nbits
         self.axis_key = axis_key
         self.axis_value = axis_value
@@ -635,10 +632,12 @@ def __init__(
             residual_length=residual_length,
         )
 
-        if not _is_quanto_greater_than_0_2_5:
+        # We need to import quanto here to avoid circular imports due to optimum/quanto/models/transformers_models.py
+        if is_quanto_greater("0.2.5", accept_dev=True):
+            from optimum.quanto import MaxOptimizer, qint2, qint4
+        else:
             raise ImportError(
                 "You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. "
-                "Detected version {optimum_quanto_version}."
             )
 
         if self.nbits not in [2, 4]:
@@ -656,6 +655,8 @@
         self.optimizer = MaxOptimizer()  # hardcode as it's the only one for per-channel quantization
 
     def _quantize(self, tensor, axis):
+        from optimum.quanto import quantize_weight
+
         scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
         qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
         return qtensor
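The change above defers the `optimum.quanto` imports into `__init__` and `_quantize`, per the new comment, to avoid a circular import through `optimum/quanto/models/transformers_models.py`. A generic, hedged sketch of that deferred-import-plus-version-gate pattern (the module name `optional_backend` and its `quantize()` function are illustrative placeholders, not the optimum-quanto API):

```python
# Generic sketch of the pattern used above: defer an optional dependency's import
# into the function that needs it, and gate on a minimum version. The module name
# "optional_backend" and its quantize() function are illustrative placeholders.
from importlib import import_module
from importlib.metadata import version

from packaging.version import Version


def quantize_tensor(tensor, min_backend_version: str = "0.2.5"):
    # Importing here (not at module import time) means this module can be imported
    # even when the backend is absent, and cannot form an import cycle with it.
    try:
        backend = import_module("optional_backend")
    except ImportError as exc:
        raise ImportError("optional_backend is required for quantized caches.") from exc

    if Version(version("optional_backend")) < Version(min_backend_version):
        raise ImportError(f"optional_backend >= {min_backend_version} is required.")

    return backend.quantize(tensor)
```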

src/transformers/commands/serving.py

Lines changed: 14 additions & 5 deletions
@@ -830,13 +830,22 @@ def get_processor_inputs_from_inbound_messages(messages, modality: Modality):
         parsed_message = {"role": message["role"], "content": []}
 
         if modality == Modality.LLM:
-            # If we're working with LLMs, then "content" is a single string.
-            content = message["content"] if isinstance(message["content"], str) else message["content"]["text"]
-            parsed_message["content"] = content
+            # Input: `content` is a string or a list of dictionaries with a "text" key.
+            # Output: `content` is a string.
+            if isinstance(message["content"], str):
+                parsed_content = message["content"]
+            elif isinstance(message["content"], list):
+                parsed_content = []
+                for content in message["content"]:
+                    if content["type"] == "text":
+                        parsed_content.append(content["text"])
+                parsed_content = " ".join(parsed_content)
+            parsed_message["content"] = parsed_content
 
         elif modality == Modality.VLM:
-            # If we're working with VLMs, then "content" is a dictionary, containing a "type" key indicating
-            # which other key will be present and the type of the value of said key.
+            # Input: `content` is a string or a list of dictionaries with a "type" key (possible types: "text",
+            # "image_url").
+            # Output: `content` is a list of dictionaries with a "type" key
             if isinstance(message["content"], str):
                 parsed_message["content"].append({"type": "text", "text": message["content"]})
             else:
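The LLM branch now accepts either a plain string or a list of typed chunks and flattens the text chunks into a single string. A standalone, hedged sketch that mirrors the diff's normalization logic (not the actual `serving.py` helper):

```python
# Standalone sketch mirroring the LLM-content normalization added above: accept a
# plain string or a list of {"type": ..., ...} chunks and return a single string
# built from the text chunks. Illustrative only, not the serving.py code itself.
from typing import Union


def normalize_llm_content(content: Union[str, list]) -> str:
    if isinstance(content, str):
        return content
    # Keep only text chunks and join them with spaces, as in the diff above.
    return " ".join(chunk["text"] for chunk in content if chunk.get("type") == "text")


print(normalize_llm_content("hello"))  # -> hello
print(normalize_llm_content([
    {"type": "text", "text": "describe this image"},
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    {"type": "text", "text": "briefly"},
]))  # -> describe this image briefly
```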

src/transformers/generation/utils.py

Lines changed: 34 additions & 41 deletions
@@ -2165,7 +2165,7 @@ def generate(
         negative_prompt_ids: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         use_model_defaults: Optional[bool] = None,
-        custom_generate: Optional[str] = None,
+        custom_generate: Optional[Union[str, Callable]] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
         r"""
@@ -2235,11 +2235,15 @@ def generate(
                 generation configuration (`model.generation_config`), as opposed to the global defaults
                 (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
                 `True`.
-            custom_generate (`str`, *optional*):
-                A string containing the name of a huggingface.co repository. If provided, the custom `generate`
-                function defined in that reposity's `custom_generate/generate.py` file will be executed instead of the
-                standard `generate` method. Note that the logic is for generation is entirely defined in that
-                repository, and the return type may be different from the standard `generate` method.
+            custom_generate (`str` or `Callable`, *optional*):
+                One of the following:
+                - `str` (Hugging Face Hub repository name): runs the custom `generate` function defined at
+                  `custom_generate/generate.py` in that repository instead of the standard `generate` method. The
+                  repository fully replaces the generation logic, and the return type may differ.
+                - `str` (local repository path): same as above but from a local path, `trust_remote_code` not required.
+                - `Callable`: `generate` will perform the usual input preparation steps, then call the provided callable to
+                  run the decoding loop.
+                For more information, see [the docs](../../generation_strategies#custom-generation-methods).
             kwargs (`dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@@ -2263,7 +2267,7 @@ def generate(
         """
         # 0. If requested, load an arbitrary generation recipe from the Hub and run it instead
         trust_remote_code = kwargs.pop("trust_remote_code", None)
-        if custom_generate is not None:
+        if custom_generate is not None and isinstance(custom_generate, str):
            # Get all `generate` arguments in a single variable. Custom functions are responsible for handling them:
            # they receive the same inputs as `generate`, with `model` instead of `self` and excluding the arguments to
            # trigger the custom generation. They can access to methods from `GenerationMixin` through `model`.
@@ -2360,6 +2364,14 @@ def generate(
         else:
             input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids")
 
+        # Expand inputs depending on the generation mode
+        input_ids, model_kwargs = self._expand_inputs_for_generation(
+            input_ids=input_ids,
+            expand_size=max(generation_config.num_beams, generation_config.num_return_sequences),
+            is_encoder_decoder=self.config.is_encoder_decoder,
+            **model_kwargs,
+        )
+
         if generation_config.token_healing:
             input_ids = self.heal_tokens(input_ids, tokenizer)
 
@@ -2441,7 +2453,18 @@ def generate(
         model_kwargs["use_cache"] = generation_config.use_cache
 
         # 10. go into different generation modes
-        if generation_mode == GenerationMode.ASSISTED_GENERATION:
+        if isinstance(custom_generate, Callable):
+            result = custom_generate(
+                self,
+                input_ids,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+        elif generation_mode == GenerationMode.ASSISTED_GENERATION:
             if generation_config.num_return_sequences > 1:
                 raise ValueError(
                     "num_return_sequences has to be 1 when doing assisted generate, "
@@ -2530,15 +2553,7 @@ def generate(
             )
 
         elif generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
-            # 11. expand input_ids with `num_return_sequences` additional sequences per batch
-            input_ids, model_kwargs = self._expand_inputs_for_generation(
-                input_ids=input_ids,
-                expand_size=generation_config.num_return_sequences,
-                is_encoder_decoder=self.config.is_encoder_decoder,
-                **model_kwargs,
-            )
-
-            # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
+            # 11. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
             result = self._sample(
                 input_ids,
                 logits_processor=prepared_logits_processor,
@@ -2550,14 +2565,7 @@ def generate(
             )
 
         elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
-            # 11. interleave input_ids with `num_beams` additional sequences per batch
-            input_ids, model_kwargs = self._expand_inputs_for_generation(
-                input_ids=input_ids,
-                expand_size=generation_config.num_beams,
-                is_encoder_decoder=self.config.is_encoder_decoder,
-                **model_kwargs,
-            )
-            # 12. run beam sample
+            # 11. run beam sample
             result = self._beam_search(
                 input_ids,
                 logits_processor=prepared_logits_processor,
@@ -2583,14 +2591,6 @@ def generate(
                 num_beam_groups=generation_config.num_beam_groups,
                 max_length=generation_config.max_length,
             )
-            # 12. interleave input_ids with `num_beams` additional sequences per batch
-            input_ids, model_kwargs = self._expand_inputs_for_generation(
-                input_ids=input_ids,
-                expand_size=generation_config.num_beams,
-                is_encoder_decoder=self.config.is_encoder_decoder,
-                **model_kwargs,
-            )
-            # 13. run beam search
             result = self._group_beam_search(
                 input_ids,
                 beam_scorer,
@@ -2657,14 +2657,7 @@ def typeerror():
                 num_beam_hyps_to_keep=generation_config.num_return_sequences,
                 max_length=generation_config.max_length,
             )
-            # 12. interleave input_ids with `num_beams` additional sequences per batch
-            input_ids, model_kwargs = self._expand_inputs_for_generation(
-                input_ids=input_ids,
-                expand_size=generation_config.num_beams,
-                is_encoder_decoder=self.config.is_encoder_decoder,
-                **model_kwargs,
-            )
-            # 13. run beam search
+            # 12. run beam search
             result = self._constrained_beam_search(
                 input_ids,
                 constrained_beam_scorer=constrained_beam_scorer,
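The refactor above replaces the per-mode `_expand_inputs_for_generation` calls with a single early expansion sized by `max(num_beams, num_return_sequences)`. A hedged toy illustration of what that expansion amounts to, using plain `repeat_interleave` rather than the actual Transformers helper:

```python
# Toy illustration of the single up-front batch expansion: every prompt row is
# repeated expand_size times, where expand_size is num_beams for beam methods and
# num_return_sequences for sampling. This is not the actual Transformers helper.
import torch

input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])  # batch of 2 prompts
attention_mask = torch.ones_like(input_ids)

num_beams, num_return_sequences = 4, 1
expand_size = max(num_beams, num_return_sequences)

expanded_ids = input_ids.repeat_interleave(expand_size, dim=0)
expanded_mask = attention_mask.repeat_interleave(expand_size, dim=0)

print(expanded_ids.shape)  # torch.Size([8, 3]): 2 prompts * 4 beams
```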

src/transformers/models/idefics2/modeling_idefics2.py

Lines changed: 7 additions & 4 deletions
@@ -141,8 +141,12 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
         max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
-        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
-        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
+        boundaries = torch.arange(
+            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side, device=pixel_values.device
+        )
+        position_ids = torch.full(
+            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0, device=pixel_values.device
+        )
 
         for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
             nb_patches_h = p_attn_mask[:, 0].sum()
@@ -158,9 +162,8 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B
             bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
 
             pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
-            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+            position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids
 
-        position_ids = position_ids.to(self.position_embedding.weight.device)
         embeddings = embeddings + self.position_embedding(position_ids)
         return embeddings
 
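The fix above allocates `boundaries` and `position_ids` directly on `pixel_values.device` and drops both the `.cpu()` indexing and the later `.to(...)` hop, so the position-id computation stays on one device. A small hedged sketch of that device-aware pattern with toy shapes (not the model code itself):

```python
# Toy sketch of the device-aware pattern from the fix: allocate tensors on the
# inputs' device and index with an on-device boolean mask, rather than detouring
# through .cpu() and copying back afterwards. Shapes and values are placeholders.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

pixel_like = torch.randn(2, 16, device=device)   # stands in for pixel_values
mask = torch.rand(2, 16, device=device) > 0.5    # stands in for patch_attention_mask

position_ids = torch.full((2, 16), 0, device=pixel_like.device)
for batch_idx, row_mask in enumerate(mask):
    values = torch.arange(int(row_mask.sum()), device=pixel_like.device)
    position_ids[batch_idx][row_mask] = values   # boolean indexing stays on-device

print(position_ids.device)
```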
