1- // Copyright 2025 Google LLC
1+ // Copyright 2026 Google LLC
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
@@ -108,6 +108,33 @@ message Content {
108108// A `Part` must have a fixed IANA MIME type identifying the type and subtype
109109// of the media if `inline_data` or `file_data` field is filled with raw bytes.
110110message Part {
111+ // Per-part media resolution.
112+ // Media resolution for the input media.
113+ message MediaResolution {
114+ // The media resolution level.
115+ enum Level {
116+ // Media resolution has not been set.
117+ MEDIA_RESOLUTION_UNSPECIFIED = 0 ;
118+
119+ // Media resolution set to low.
120+ MEDIA_RESOLUTION_LOW = 1 ;
121+
122+ // Media resolution set to medium.
123+ MEDIA_RESOLUTION_MEDIUM = 2 ;
124+
125+ // Media resolution set to high.
126+ MEDIA_RESOLUTION_HIGH = 3 ;
127+
128+ // Media resolution set to ultra high. This applies to images only.
129+ MEDIA_RESOLUTION_ULTRA_HIGH = 4 ;
130+ }
131+
132+ oneof value {
133+ // The tokenization quality used for the given media.
134+ Level level = 1 ;
135+ }
136+ }
137+
111138 oneof data {
112139 // Optional. Text part (can be code).
113140 string text = 1 [(google.api.field_behavior ) = OPTIONAL ];
@@ -150,6 +177,10 @@ message Part {
150177 // video data is presented in inline_data or file_data.
151178 VideoMetadata video_metadata = 4 [(google.api.field_behavior ) = OPTIONAL ];
152179 }
180+
181+ // Per-part media resolution.
182+ // Media resolution for the input media.
183+ MediaResolution media_resolution = 12 ;
153184}
154185
155186// Content blob.
@@ -182,6 +213,10 @@ message VideoMetadata {
182213 // Optional. The end offset of the video.
183214 google.protobuf.Duration end_offset = 2
184215 [(google.api.field_behavior ) = OPTIONAL ];
216+
217+ // Optional. The frame rate of the video sent to the model. If not specified,
218+ // the default value is 1.0. The valid range is (0.0, 24.0], i.e. 0 < fps <= 24.
219+ double fps = 3 [(google.api.field_behavior ) = OPTIONAL ];
185220}
186221
187222// Configuration for a prebuilt voice.
@@ -202,7 +237,6 @@ message ReplicatedVoiceConfig {
202237 bytes voice_sample_audio = 2 [(google.api.field_behavior ) = OPTIONAL ];
203238}
204239
205-
206240// Configuration for a voice.
207241message VoiceConfig {
208242 // The configuration for the speaker to use.
@@ -250,6 +284,37 @@ message SpeechConfig {
250284
251285// Config for image generation features.
252286message ImageConfig {
287+ // The image output format for generated images.
288+ message ImageOutputOptions {
289+ // Optional. The MIME type of the image format the output should be saved as.
290+ optional string mime_type = 1 [(google.api.field_behavior ) = OPTIONAL ];
291+
292+ // Optional. The compression quality of the output image.
293+ optional int32 compression_quality = 2
294+ [(google.api.field_behavior ) = OPTIONAL ];
295+ }
296+
297+ // Controls the generation of people in images.
298+ enum PersonGeneration {
299+ // The default behavior is unspecified. The model will decide whether to
300+ // generate images of people.
301+ PERSON_GENERATION_UNSPECIFIED = 0 ;
302+
303+ // Allows the model to generate images of people, including adults and
304+ // children.
305+ ALLOW_ALL = 1 ;
306+
307+ // Allows the model to generate images of adults, but not children.
308+ ALLOW_ADULT = 2 ;
309+
310+ // Prevents the model from generating images of people.
311+ ALLOW_NONE = 3 ;
312+ }
313+
314+ // Optional. Output options (format, compression quality) for generated images.
315+ optional ImageOutputOptions image_output_options = 1
316+ [(google.api.field_behavior ) = OPTIONAL ];
317+
253318 // Optional. The desired aspect ratio for the generated images. The following
254319 // aspect ratios are supported:
255320 //
@@ -260,6 +325,14 @@ message ImageConfig {
260325 // "9:16", "16:9"
261326 // "21:9"
262327 optional string aspect_ratio = 2 [(google.api.field_behavior ) = OPTIONAL ];
328+
329+ // Optional. Controls whether the model can generate images of people.
330+ optional PersonGeneration person_generation = 3
331+ [(google.api.field_behavior ) = OPTIONAL ];
332+
333+ // Optional. Specifies the size of generated images. Supported values are
334+ // `1K`, `2K`, `4K`. If not specified, the model uses the default value `1K`.
335+ optional string image_size = 4 [(google.api.field_behavior ) = OPTIONAL ];
263336}
264337
265338// Generation config.
@@ -308,13 +381,65 @@ message GenerationConfig {
308381
309382 // Config for thinking features.
310383 message ThinkingConfig {
384+ // The thinking level for the model.
385+ enum ThinkingLevel {
386+ // Unspecified thinking level.
387+ THINKING_LEVEL_UNSPECIFIED = 0 ;
388+
389+ // Low thinking level.
390+ LOW = 1 ;
391+
392+ // Medium thinking level.
393+ MEDIUM = 2 ;
394+
395+ // High thinking level.
396+ HIGH = 3 ;
397+
398+ // Minimal thinking level.
399+ MINIMAL = 4 ;
400+ }
401+
311402 // Indicates whether to include thoughts in the response.
312403 // If true, thoughts are returned only when available.
313404 optional bool include_thoughts = 1 [(google.api.field_behavior ) = OPTIONAL ];
314405
315406 // Optional. Indicates the thinking budget in tokens.
316407 // This is only applied when enable_thinking is true.
317408 optional int32 thinking_budget = 3 [(google.api.field_behavior ) = OPTIONAL ];
409+
410+ // Optional. The level of thought tokens that the model should generate.
411+ optional ThinkingLevel thinking_level = 4
412+ [(google.api.field_behavior ) = OPTIONAL ];
413+ }
414+
415+ // A modality that can be requested for the response.
416+ enum Modality {
417+ // Unspecified modality. Will be processed as text.
418+ MODALITY_UNSPECIFIED = 0 ;
419+
420+ // Text modality.
421+ TEXT = 1 ;
422+
423+ // Image modality.
424+ IMAGE = 2 ;
425+
426+ // Audio modality.
427+ AUDIO = 3 ;
428+ }
429+
430+ // Media resolution level (token budget) for the input media.
431+ enum MediaResolution {
432+ // Media resolution has not been set.
433+ MEDIA_RESOLUTION_UNSPECIFIED = 0 ;
434+
435+ // Media resolution set to low (64 tokens).
436+ MEDIA_RESOLUTION_LOW = 1 ;
437+
438+ // Media resolution set to medium (256 tokens).
439+ MEDIA_RESOLUTION_MEDIUM = 2 ;
440+
441+ // Media resolution set to high (zoomed reframing with 256 tokens).
442+ MEDIA_RESOLUTION_HIGH = 3 ;
318443 }
319444
320445 // Optional. Controls the randomness of predictions.
@@ -411,6 +536,27 @@ message GenerationConfig {
411536 optional RoutingConfig routing_config = 17
412537 [(google.api.field_behavior ) = OPTIONAL ];
413538
539+ // Optional. If enabled, audio timestamps will be included in the request to
540+ // the model. This can be useful for synchronizing audio with other modalities
541+ // in the response.
542+ optional bool audio_timestamp = 20 [(google.api.field_behavior ) = OPTIONAL ];
543+
544+ // Optional. The modalities of the response. The model will generate a
545+ // response that includes all the specified modalities. For example, if this
546+ // is set to `[TEXT, IMAGE]`, the response will include both text and an
547+ // image.
548+ repeated Modality response_modalities = 21
549+ [(google.api.field_behavior ) = OPTIONAL ];
550+
551+ // Optional. The token resolution at which input media content is sampled.
552+ // This is used to control the trade-off between the quality of the response
553+ // and the number of tokens used to represent the media. A higher resolution
554+ // allows the model to perceive more detail, which can lead to a more nuanced
555+ // response, but it will also use more tokens. This does not affect the
556+ // image dimensions sent to the model.
557+ optional MediaResolution media_resolution = 22
558+ [(google.api.field_behavior ) = OPTIONAL ];
559+
414560 // Optional. The speech generation config.
415561 optional SpeechConfig speech_config = 23
416562 [(google.api.field_behavior ) = OPTIONAL ];
0 commit comments