Fix typing hints for different rope parameters per layer type #43320
Changes from 2 commits

@@ -18,7 +18,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any
+from typing import Any, Literal

 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...modeling_rope_utils import RopeParameters

@@ -92,10 +92,9 @@ class Gemma3TextConfig(PreTrainedConfig):
             Scaling factor when applying tanh softcapping on the logits.
         attn_logit_softcapping (`float`, *optional*):
             Scaling factor when applying tanh softcapping on the attention scores.
-        rope_parameters (`RopeParameters`, *optional*):
-            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
-            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
-            with longer `max_position_embeddings`.
+        rope_parameters (`dict`, *optional*):
+            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
+            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
         use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
             If True, the model will attend to all text tokens instead of using a causal mask. This does not change
             behavior for vision tokens.

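For illustration, a `rope_parameters` dict in the new per-layer-type format described by the updated docstring could look like the sketch below. The `rope_theta` and `factor` values are made up for the example and are not taken from this diff.

```python
# Hypothetical example of the new `rope_parameters` layout: one RopeParameters
# entry per attention pattern, each carrying `rope_type` plus optional scaling keys.
rope_parameters = {
    "full_attention": {"rope_type": "linear", "rope_theta": 1_000_000.0, "factor": 8.0},
    "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
}
```
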
@@ -155,7 +154,7 @@ def __init__(
         layer_types: list[str] | None = None,
         final_logit_softcapping: float | None = None,
         attn_logit_softcapping: float | None = None,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] | None = None,
         use_bidirectional_attention: bool | None = False,
         tie_word_embeddings: bool | None = True,
         **kwargs,

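A minimal usage sketch of the tightened annotation, assuming `Gemma3TextConfig` is importable from `transformers` and `RopeParameters` from `transformers.modeling_rope_utils` (the module the diff imports it from via a relative path). With `Literal` keys, a type checker can reject layer-type keys that the old `dict[str, RopeParameters]` hint silently accepted.

```python
from typing import Literal

from transformers import Gemma3TextConfig
from transformers.modeling_rope_utils import RopeParameters

# The Literal-keyed hint lets mypy/pyright flag keys other than the two
# supported attention patterns; a plain `dict[str, RopeParameters]` would not.
rope_parameters: dict[Literal["full_attention", "sliding_attention"], RopeParameters] = {
    "full_attention": {"rope_type": "default", "rope_theta": 1_000_000.0},
    "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
}

config = Gemma3TextConfig(rope_parameters=rope_parameters)
```
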
@@ -205,12 +204,17 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwa
         rope_scaling = kwargs.pop("rope_scaling", None)

         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
-        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
+        # as arg in the inputs and both `sliding_attention` and `full_attention` are present, we can safely assume
+        # that it is in the new format. New naming used -> new format
         default_rope_params = {
             "sliding_attention": {"rope_type": "default"},
             "full_attention": {"rope_type": "default"},
         }
         self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
+        if (
+            self.rope_parameters.get("sliding_attention") is not None
+            and self.rope_parameters.get("full_attention") is not None
+        ):
+            self.rope_parameters = default_rope_params

         if rope_scaling is not None:
             self.rope_parameters["full_attention"].update(rope_scaling)
             self.rope_parameters["full_attention"].setdefault(

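To make the backward-compat path around this hunk concrete, below is a simplified standalone sketch (not the library code) of the fallback-and-merge behaviour visible in the context lines: a missing `rope_parameters` falls back to one default entry per attention pattern, and a legacy `rope_scaling` dict is folded into the `full_attention` entry. The new format-detection branch is deliberately omitted, and the function name is invented for the example.

```python
from typing import Any


def normalize_rope_parameters(
    rope_parameters: dict[str, Any] | None,
    rope_scaling: dict[str, Any] | None = None,
) -> dict[str, dict[str, Any]]:
    """Sketch loosely following the BC handling in `convert_rope_params_to_dict`."""
    # Fall back to one default RopeParameters entry per attention pattern.
    if rope_parameters is None:
        rope_parameters = {
            "sliding_attention": {"rope_type": "default"},
            "full_attention": {"rope_type": "default"},
        }

    # Mirror the diff: a legacy `rope_scaling` dict is merged into the
    # `full_attention` entry only.
    if rope_scaling is not None:
        rope_parameters["full_attention"].update(rope_scaling)
    return rope_parameters


# Legacy call style: no `rope_parameters`, scaling passed separately.
print(normalize_rope_parameters(None, rope_scaling={"rope_type": "linear", "factor": 8.0}))
```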