Commit fd5c0d5

Merge branch 'master' into patch-3
2 parents: 6c2d700 + 166b731

32 files changed: +1456 -270 lines

.github/workflows/ci.yml

Lines changed: 1 addition & 7 deletions

@@ -170,7 +170,7 @@ jobs:
           CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
           CIBW_MANYLINUX_AARCH64_IMAGE: manylinux2014
           CIBW_ARCHS: ${{ matrix.arch }}
-          CIBW_SKIP: pp* *-musllinux_*
+          CIBW_SKIP: "*-musllinux_*"

       - name: Upload Python wheels
         uses: actions/upload-artifact@v4
@@ -195,10 +195,6 @@ jobs:
             artifact_pattern: python-wheels-Linux-aarch64
             wheel_pattern: "*cp310*manylinux*_aarch64.whl"

-          #- os: windows-2022
-          #  artifact_pattern: python-wheels-Windows-auto64
-          #  wheel_pattern: "*cp310*win*.whl"
-
           - os: macos-15
             artifact_pattern: python-wheels-macOS-arm64
             wheel_pattern: "*cp310*macosx*arm64.whl"
@@ -226,8 +222,6 @@ jobs:
       - name: Install wheel
         shell: bash
         run: |
-          ls -l
-          find .
           pip install ${{ matrix.wheel_pattern }}

       - name: Test Python wheel

CHANGELOG.md

Lines changed: 14 additions & 1 deletion

@@ -4,7 +4,20 @@

 ### Fixes and improvements

-## [v4.6.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.6.1) (2025-10-07)
+## [v4.6.2](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.6.2) (2025-12-05)
+
+### New features
+
+* Qwen 3 support (#1943) by [@jordimas](https://github.com/jordimas)
+* Gemma 3 text support (#1936) by [@jordimas](https://github.com/jordimas)
+
+### Fixes and improvements
+
+* Fixed pkg_resources Deprecated Warning (#1911) by [@thawancomt](https://github.com/thawancomt)
+* Disable INT8 for sm120 - Blackwell GPUs (#1937) by [@Purfview](https://github.com/Purfview)
+* FIX: package libctranslate2.so in wheel to avoid build fail (#1920) by [@yzewei](https://github.com/yzewei)
+
+## [v4.6.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.6.1) (2025-11-07)

 ### New features

CMakeLists.txt

Lines changed: 4 additions & 2 deletions

@@ -170,6 +170,8 @@ set(SOURCES
   src/ops/mean.cc
   src/ops/mean_cpu.cc
   src/ops/median_filter.cc
+  src/ops/median_filter_cpu.cc
+  src/ops/median_filter_gpu.cu
   src/ops/min_max.cc
   src/ops/mul.cc
   src/ops/multinomial.cc
@@ -545,8 +547,9 @@ if (WITH_CUDA)
     list(APPEND PRIVATE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR})
     list(APPEND LIBRARIES ${CUDNN_LIBRARIES})
     add_definitions(-DCT2_WITH_CUDNN)
+    list(APPEND SOURCES src/ops/conv1d_cudnn_gpu.cu)
   else()
-    message(WARNING "cuDNN library is not enabled: convolution layers will not be supported on GPU")
+    list(APPEND SOURCES src/ops/conv1d_gpu.cu)
   endif()

   if(CUDA_DYNAMIC_LOADING)
@@ -636,7 +639,6 @@ if (WITH_CUDA)
     src/ops/alibi_add_gpu.cu
     src/ops/bias_add_gpu.cu
     src/ops/concat_split_slide_gpu.cu
-    src/ops/conv1d_gpu.cu
     src/ops/dequantize_gpu.cu
     src/ops/flash_attention_gpu.cu
     src/ops/gather_gpu.cu

CONTRIBUTING.md

Lines changed: 16 additions & 3 deletions

@@ -23,6 +23,19 @@ Do you think a feature is missing or would be a great addition to the project? P
 * look for GitHub issues marked with the *help wanted* label: these are developments that we find particularly suited for community contributions.
 * If you are planning to make a large change to the existing code, consider asking first on [the forum](https://forum.opennmt.net/) to confirm that it is welcome.

+## Contribution rules
+
+CTranslate2 is a low-level, performance-critical codebase. A single misplaced pointer or inefficient memory allocation (which LLMs often get wrong) can take hours to debug.
+
+To maintain code integrity and manage maintainer workload, we apply the following policy:
+
+* Use of AI tools for brainstorming or minor assistance is acceptable, but contributors must explicitly disclose how AI was used and remain fully responsible for correctness, performance, and design. Submissions that appear generated without deep understanding will be declined: verifying AI output for correctness and performance is more time-consuming than writing the code manually.
+
+* Mandatory deep understanding: contributors must fully understand their code and be prepared to justify the purpose of every part of their change.
+
+* Please contribute within your area of expertise. If you are not familiar with the core codebase, consider contributing to documentation, examples, or Hugging Face integrations.
+
 ### Building the sources

 See [Install from sources](https://opennmt.net/CTranslate2/installation.html#install-from-sources).
@@ -85,7 +98,7 @@ The list is ordered on 5. from the largest to smallest time.

 #### `StorageView` class

-CTranslate2 uses [row-major](https://en.wikipedia.org/wiki/Row-_and_column-major_order) storages, usually encapsulated in the `StorageView` class. This class acts like a tensor representation but without the mathematical semantics. It is convenience wrapper to view a buffer of data in a particular shape, and provides methods to resize, reshape, and copy data. The underlying storage has a type (e.g. `float`) and a location (e.g. GPU #1) which are both resolved at runtime.
+CTranslate2 uses [row-major](https://en.wikipedia.org/wiki/Row-_and_column-major_order) storages, usually encapsulated in the `StorageView` class. This class acts like a tensor representation but without the mathematical semantics. It is a convenience wrapper to view a buffer of data in a particular shape, and provides methods to resize, reshape, and copy data. The underlying storage has a type (e.g. `float`) and a location (e.g. GPU #1) which are both resolved at runtime.

 To maximize performance, the implementation avoid new allocations when possible:

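For reference, a thin binding of `StorageView` is also exposed in the Python API. A minimal sketch of wrapping a NumPy buffer (attribute names as in recent `ctranslate2` wheels; worth verifying against the installed version):

```python
# Minimal sketch: viewing an existing NumPy buffer through the Python
# binding of StorageView. from_array() wraps the buffer without copying;
# the dtype and device are resolved at runtime, as described above for
# the C++ class.
import numpy as np
import ctranslate2

data = np.arange(12, dtype=np.float32).reshape(3, 4)  # row-major buffer
view = ctranslate2.StorageView.from_array(data)

print(view.shape)   # e.g. [3, 4]
print(view.dtype)   # e.g. float32
print(view.device)  # e.g. "cpu"
```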
@@ -144,7 +157,7 @@ To limit the size of the packages pushed to PyPI, some libraries are not include

 One of the benefits of this dynamic loading is that multiple versions of cuBLAS and cuDNN are supported by the same binary. In particular, users can install any CUDA 12.x version as long as it provides `libcublas.so.12`.

-The Python library only support CUDA 12.x. C++ source code is always compatible with CUDA 11, possible to use CUDA 11 libraries during compilation to create CUDA 11.x support wheel.
+The Python library only supports CUDA 12.x. The C++ source code remains compatible with CUDA 11, so a CUDA 11.x wheel can be built by compiling against CUDA 11 libraries.

 ### Updating other dependencies

@@ -161,7 +174,7 @@ If a dependency needs an update, it is particularly important that it is updated

 ### Managing PyPI project size limit

-Projects on PyPI have a size limit. The default limit is 10GB and [we already requested](https://github.com/pypi/support/issues/1480) an increase to 20GB in the past. Because increase requests can take several months to be accepted, we now try to work with this 20GB limit.
+Projects on PyPI have a size limit. The default limit is 10GB; the CTranslate2 project currently [has a 50GB storage limit](https://github.com/pypi/support/issues/8119).

 So older releases need to be regularly deleted on PyPI to make room for new releases. **However, make sure to keep the latest release of each major version.**

README.md

Lines changed: 10 additions & 0 deletions

@@ -119,6 +119,16 @@ Executed with 4 threads on a [*c5.2xlarge*](https://aws.amazon.com/ec2/instance-

 Executed with CUDA 11 on a [*g5.xlarge*](https://aws.amazon.com/ec2/instance-types/g5/) Amazon EC2 instance equipped with a NVIDIA A10G GPU (driver version: 510.47.03).

+## Contributing
+
+CTranslate2 is a community-driven project. We welcome contributions of all kinds:
+* **New Model Support:** Help us implement more Transformer architectures.
+* **Performance:** Propose optimizations for CPU or GPU kernels.
+* **Bug Reports:** Open an issue if you find something not working as expected.
+* **Documentation:** Improve our guides or add new examples.
+
+Check out our [Contributing Guide](CONTRIBUTING.md) to learn how to set up your development environment.
+
 ## Additional resources

 * [Documentation](https://opennmt.net/CTranslate2)

docs/guides/transformers.md

Lines changed: 80 additions & 1 deletion

@@ -8,6 +8,8 @@ CTranslate2 supports selected models from Hugging Face's [Transformers](https://
 * CodeGen
 * DistilBERT
 * Falcon
+* Gemma 2
+* Gemma 3 (text only)
 * Llama
 * M2M100
 * MarianMT
@@ -20,6 +22,8 @@ CTranslate2 supports selected models from Hugging Face's [Transformers](https://
 * GPT-NeoX
 * OPT
 * Pegasus
+* Qwen 2.5
+* Qwen 3
 * T5
 * Whisper
 * XLM-RoBERTa
@@ -80,7 +84,7 @@ print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target), skip_special_tok

 ## BERT

-[BERT](https://huggingface.co/docs/transformers/model_doc/bert) is pretrained model on English language using a masked language modeling objective.
+[BERT](https://huggingface.co/docs/transformers/model_doc/bert) is a model pretrained on English text using a masked language modeling objective.

 CTranslate2 only implements the `BertModel` class from Transformers which includes the Transformer encoder and the pooling layer. Task-specific layers should be run with PyTorch as shown in the example below.

@@ -183,6 +187,43 @@
 print(output)
 ```

+## Gemma 3 (text only)
+
+[Gemma 3](https://ai.google.dev/gemma/docs/core) is Google's latest family of lightweight, open-weight AI models, built on the same technology as Gemini.
+
+Gemma models come in two flavors: instruction-tuned (it) models and base models.
+
+Instruction-tuned models expect a specific [prompt template format](https://ai.google.dev/gemma/docs/core/prompt-structure) which you should use.
+
+When converting an instruction-tuned model, CTranslate2 sets `<end_of_turn>` as the default end-of-sequence token.
+
+To convert a model:
+
+```bash
+ct2-transformers-converter --model google/gemma-3-1b-it --output_dir gemma-3-1b-it
+```
+
+Gemma 3 usage sample:
+
+```python
+from transformers import AutoTokenizer
+import ctranslate2
+
+tok = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
+gen = ctranslate2.Generator("gemma-3-1b-it")
+
+prompt = "<start_of_turn>user\nGenerate a 200 word text talking about George Orwell.<end_of_turn>\n<start_of_turn>model\n"
+tokens = tok.convert_ids_to_tokens(tok.encode(prompt))
+
+res = gen.generate_batch([tokens], max_length=2048, sampling_temperature=0.1, include_prompt_in_result=False)
+print(tok.convert_tokens_to_string(res[0].sequences[0]))
+```
+
@@ -446,6 +487,44 @@
 print(output)
 ```

+## Qwen 3
+
+[Qwen 3](https://github.com/QwenLM/Qwen3) is a collection of large language models developed by the Alibaba Group. A key feature is the ability to switch between a "thinking mode" for complex reasoning and a "non-thinking mode" for efficient general chat.
+
+To convert a model:
+
+```bash
+ct2-transformers-converter --model Qwen/Qwen3-4B --quantization float16 --output_dir qwen3-4b-ct2
+```
+
+Usage sample:
+
+You can use the converted model for text generation with `ctranslate2.Generator`. For Qwen 3 instruction-tuned models, use the Hugging Face tokenizer's `apply_chat_template` method to correctly format your prompts, especially when dealing with the optional "thinking mode". MoE model variants are currently not supported.
+
+```python
+import ctranslate2
+import transformers
+
+generator = ctranslate2.Generator("qwen3-4b-ct2")
+tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
+
+def generate(prompt):
+    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt, add_special_tokens=False))
+    results = generator.generate_batch([tokens], max_length=2048, sampling_temperature=0.7, include_prompt_in_result=False)
+    return tokenizer.decode(results[0].sequences_ids[0])
+
+prompt_base = """<|im_start|>user
+A train leaves Station A at 60 mph heading towards Station B, 300 miles away. At the same time, another train leaves Station B at 40 mph heading towards Station A. When will they meet and how far from Station A?
+<|im_end|>
+<|im_start|>assistant"""
+
+print("Non-thinking:\n" + "-"*60)
+print(generate(prompt_base + "\n<think></think>\n"))
+
+print("\nThinking:\n" + "="*60)
+print(generate(prompt_base))
+```
+
 ## T5

 [T5](https://huggingface.co/docs/transformers/model_doc/t5) is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which each task is converted into a text-to-text format.

include/ctranslate2/batch_reader.h

Lines changed: 8 additions & 1 deletion

@@ -56,7 +56,8 @@ namespace ctranslate2 {

     std::vector<Example>
     get_next(const size_t max_batch_size,
-             const BatchType batch_type = BatchType::Examples);
+             const BatchType batch_type = BatchType::Examples,
+             const bool consider_padding = false);

     // Consumes and returns the next example.
     virtual Example get_next_example() = 0;
@@ -67,6 +68,12 @@
     }

   private:
+    std::vector<Example> fill_batch_with_fixed_increment(const size_t max_batch_size,
+                                                         const BatchType batch_type);
+
+    std::vector<Example> fill_batch_with_variable_increment(const size_t max_batch_size,
+                                                            const BatchType batch_type);
+
     bool _initialized = false;
     Example _next;
   };
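
The new `consider_padding` flag suggests that batches can be sized by their padded token count rather than by the raw sum of lengths. A rough Python sketch of that accounting (illustrative only, not the C++ implementation; `fill_batch` and its arguments are hypothetical names):

```python
# Illustrative sketch: when padding is considered, the effective cost of a
# batch is max_length * num_examples, because shorter examples are padded
# up to the longest example in the batch.
def fill_batch(examples, max_batch_tokens):
    batch, max_len = [], 0
    for example in examples:  # assumed already sorted by length
        new_max = max(max_len, len(example))
        if batch and new_max * (len(batch) + 1) > max_batch_tokens:
            yield batch
            batch, max_len = [], 0
            new_max = len(example)
        batch.append(example)
        max_len = new_max
    if batch:
        yield batch
```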

include/ctranslate2/layers/attention.h

Lines changed: 3 additions & 0 deletions

@@ -2,6 +2,7 @@

 #include "ctranslate2/layers/attention_layer.h"
 #include "ctranslate2/padder.h"
+#include "ctranslate2/layers/transformer.h"

 namespace ctranslate2 {
   namespace layers {
@@ -65,6 +66,8 @@ namespace ctranslate2 {
       dim_t _relative_right_max_position;
       const bool _merge_time_and_head_dims;
       const dim_t _cache_time_dim;
+      std::unique_ptr<const LayerNorm> _q_norm; // Query normalization
+      std::unique_ptr<const LayerNorm> _k_norm; // Key normalization
     };
   }
 }
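
The new `_q_norm`/`_k_norm` members point at query/key normalization applied before attention, as used by recent architectures such as the Qwen 3 and Gemma 3 models added in this release. A schematic NumPy sketch, assuming an RMS-style normalization over the head dimension (the exact norm type and weights come from the converted model):

```python
import numpy as np

def rms_norm(x, weight, eps=1e-6):
    # Normalize over the last (head) dimension, then scale.
    variance = np.mean(x * x, axis=-1, keepdims=True)
    return x / np.sqrt(variance + eps) * weight

batch, heads, head_dim = 2, 4, 8
q = np.random.randn(batch, heads, head_dim)
k = np.random.randn(batch, heads, head_dim)
q_weight = np.ones(head_dim)
k_weight = np.ones(head_dim)

# Queries and keys are normalized before the scaled dot-product.
q = rms_norm(q, q_weight)
k = rms_norm(k, k_weight)
scores = q @ np.swapaxes(k, -1, -2) / np.sqrt(head_dim)
```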

include/ctranslate2/ops/median_filter.h

Lines changed: 3 additions & 2 deletions

@@ -1,17 +1,18 @@
 #pragma once
-
 #include "op.h"

 namespace ctranslate2 {
   namespace ops {

     class MedianFilter : public Op {
     public:
-      MedianFilter(const dim_t width);
+      explicit MedianFilter(dim_t width);
       void operator()(const StorageView& input, StorageView& output) const;

     private:
       const dim_t _width;
+      template <Device D, typename T>
+      void compute(const StorageView& input, const dim_t axis_size, StorageView& output) const;
     };

   }
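
For context, `MedianFilter` computes a sliding median of the given `width` along the last axis. A NumPy sketch of the computation (edge handling here is replicate padding; the op's exact padding behavior may differ):

```python
import numpy as np

def median_filter(x, width):
    # Sliding median along the last axis; width is expected to be odd.
    assert width % 2 == 1
    half = width // 2
    padded = np.pad(x, (half, half), mode="edge")  # replicate edges
    windows = np.lib.stride_tricks.sliding_window_view(padded, width)
    return np.median(windows, axis=-1)

print(median_filter(np.array([1.0, 9.0, 2.0, 3.0, 8.0]), 3))
# -> [1. 2. 3. 3. 8.]
```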

python/cpp/generator.cc

Lines changed: 4 additions & 4 deletions

@@ -234,10 +234,10 @@ namespace ctranslate2 {
         Arguments:
           start_tokens: Batch of start tokens. If the decoder starts from a special
             start token like ``<s>``, this token should be added to this input.
-          max_batch_size: The maximum batch size. If the number of inputs is greater than
-            :obj:`max_batch_size`, the inputs are sorted by length and split by chunks of
-            :obj:`max_batch_size` examples so that the number of padding positions is
-            minimized.
+          max_batch_size: The maximum batch size. If the number of inputs is greater than :obj:`max_batch_size`,
+            the inputs are sorted by length and split by chunks of :obj:`max_batch_size` examples
+            (or tokens when :obj:`batch_type`="tokens") so that the number of padding positions
+            is minimized.
           batch_type: Whether :obj:`max_batch_size` is the number of "examples" or "tokens".
           asynchronous: Run the generation asynchronously.
           beam_size: Beam size (1 for greedy search).
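
The batching behavior documented above is easy to exercise from the Python API; a small sketch (the model directory is a placeholder for any converted generator model):

```python
import ctranslate2

generator = ctranslate2.Generator("qwen3-4b-ct2")  # placeholder model directory

# Prompts of different lengths: with batch_type="tokens", the inputs are
# sorted by length and split into chunks of at most 64 tokens each, which
# keeps the padding inside every chunk to a minimum.
prompts = [
    ["<|im_start|>", "user"],
    ["<|im_start|>", "user", "A", "much", "longer", "prompt"],
]
results = generator.generate_batch(
    prompts,
    max_batch_size=64,
    batch_type="tokens",
    max_length=32,
)
print(results[0].sequences[0])
```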
