Skip to content

Commit 779b172

Browse files
feat: Add fps to message VideoMetadata (#7074)
* feat: Add `fps` to message `VideoMetadata` feat: Add `mediaResolution` to `Part` feat: Add `responseModalities, audioTimestamp, mediaResolution` to `GenerationConfig` feat: Add `thinkingLevel` to `ThinkingConfig` feat: Add `imageOutputOptions, personGeneration, imageSize` to `ImageConfig` PiperOrigin-RevId: 863320215 Source-Link: googleapis/googleapis@8d0579f Source-Link: googleapis/googleapis-gen@a85a936 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLWFpcGxhdGZvcm0vLk93bEJvdC55YW1sIiwiaCI6ImE4NWE5MzYxMTEyZjVmMTljMmQ0YTUxNWI5YTZhY2FlYzJhNzM5ZmUifQ== * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 5d1e2b6 commit 779b172

File tree

4 files changed

+1687
-10
lines changed

4 files changed

+1687
-10
lines changed

packages/google-cloud-aiplatform/protos/google/cloud/aiplatform/v1/content.proto

Lines changed: 148 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2025 Google LLC
1+
// Copyright 2026 Google LLC
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -108,6 +108,33 @@ message Content {
108108
// A `Part` must have a fixed IANA MIME type identifying the type and subtype
109109
// of the media if `inline_data` or `file_data` field is filled with raw bytes.
110110
message Part {
111+
// Per-part media resolution.
112+
// Media resolution for the input media.
113+
message MediaResolution {
114+
// The media resolution level.
115+
enum Level {
116+
// Media resolution has not been set.
117+
MEDIA_RESOLUTION_UNSPECIFIED = 0;
118+
119+
// Media resolution set to low.
120+
MEDIA_RESOLUTION_LOW = 1;
121+
122+
// Media resolution set to medium.
123+
MEDIA_RESOLUTION_MEDIUM = 2;
124+
125+
// Media resolution set to high.
126+
MEDIA_RESOLUTION_HIGH = 3;
127+
128+
// Media resolution set to ultra high. This is for images only.
129+
MEDIA_RESOLUTION_ULTRA_HIGH = 4;
130+
}
131+
132+
oneof value {
133+
// The tokenization quality used for given media.
134+
Level level = 1;
135+
}
136+
}
137+
111138
oneof data {
112139
// Optional. Text part (can be code).
113140
string text = 1 [(google.api.field_behavior) = OPTIONAL];
@@ -150,6 +177,10 @@ message Part {
150177
// video data is presented in inline_data or file_data.
151178
VideoMetadata video_metadata = 4 [(google.api.field_behavior) = OPTIONAL];
152179
}
180+
181+
// Per-part media resolution.
182+
// Media resolution for the input media.
183+
MediaResolution media_resolution = 12;
153184
}
154185

155186
// Content blob.
@@ -182,6 +213,10 @@ message VideoMetadata {
182213
// Optional. The end offset of the video.
183214
google.protobuf.Duration end_offset = 2
184215
[(google.api.field_behavior) = OPTIONAL];
216+
217+
// Optional. The frame rate of the video sent to the model. If not specified,
218+
// the default value is 1.0. The valid range is (0.0, 24.0].
219+
double fps = 3 [(google.api.field_behavior) = OPTIONAL];
185220
}
186221

187222
// Configuration for a prebuilt voice.
@@ -202,7 +237,6 @@ message ReplicatedVoiceConfig {
202237
bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
203238
}
204239

205-
206240
// Configuration for a voice.
207241
message VoiceConfig {
208242
// The configuration for the speaker to use.
@@ -250,6 +284,37 @@ message SpeechConfig {
250284

251285
// Config for image generation features.
252286
message ImageConfig {
287+
// The image output format for generated images.
288+
message ImageOutputOptions {
289+
// Optional. The image format that the output should be saved as.
290+
optional string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];
291+
292+
// Optional. The compression quality of the output image.
293+
optional int32 compression_quality = 2
294+
[(google.api.field_behavior) = OPTIONAL];
295+
}
296+
297+
// Enum for controlling the generation of people in images.
298+
enum PersonGeneration {
299+
// The default behavior is unspecified. The model will decide whether to
300+
// generate images of people.
301+
PERSON_GENERATION_UNSPECIFIED = 0;
302+
303+
// Allows the model to generate images of people, including adults and
304+
// children.
305+
ALLOW_ALL = 1;
306+
307+
// Allows the model to generate images of adults, but not children.
308+
ALLOW_ADULT = 2;
309+
310+
// Prevents the model from generating images of people.
311+
ALLOW_NONE = 3;
312+
}
313+
314+
// Optional. The image output format for generated images.
315+
optional ImageOutputOptions image_output_options = 1
316+
[(google.api.field_behavior) = OPTIONAL];
317+
253318
// Optional. The desired aspect ratio for the generated images. The following
254319
// aspect ratios are supported:
255320
//
@@ -260,6 +325,14 @@ message ImageConfig {
260325
// "9:16", "16:9"
261326
// "21:9"
262327
optional string aspect_ratio = 2 [(google.api.field_behavior) = OPTIONAL];
328+
329+
// Optional. Controls whether the model can generate people.
330+
optional PersonGeneration person_generation = 3
331+
[(google.api.field_behavior) = OPTIONAL];
332+
333+
// Optional. Specifies the size of generated images. Supported values are
334+
// `1K`, `2K`, `4K`. If not specified, the model uses the default value `1K`.
335+
optional string image_size = 4 [(google.api.field_behavior) = OPTIONAL];
263336
}
264337

265338
// Generation config.
@@ -308,13 +381,65 @@ message GenerationConfig {
308381

309382
// Config for thinking features.
310383
message ThinkingConfig {
384+
// The thinking level for the model.
385+
enum ThinkingLevel {
386+
// Unspecified thinking level.
387+
THINKING_LEVEL_UNSPECIFIED = 0;
388+
389+
// Low thinking level.
390+
LOW = 1;
391+
392+
// Medium thinking level.
393+
MEDIUM = 2;
394+
395+
// High thinking level.
396+
HIGH = 3;
397+
398+
// Minimal thinking level.
399+
MINIMAL = 4;
400+
}
401+
311402
// Indicates whether to include thoughts in the response.
312403
// If true, thoughts are returned only when available.
313404
optional bool include_thoughts = 1 [(google.api.field_behavior) = OPTIONAL];
314405

315406
// Optional. Indicates the thinking budget in tokens.
316407
// This is only applied when enable_thinking is true.
317408
optional int32 thinking_budget = 3 [(google.api.field_behavior) = OPTIONAL];
409+
410+
// Optional. The level of thought tokens that the model should generate.
411+
optional ThinkingLevel thinking_level = 4
412+
[(google.api.field_behavior) = OPTIONAL];
413+
}
414+
415+
// The modalities of the response.
416+
enum Modality {
417+
// Unspecified modality. Will be processed as text.
418+
MODALITY_UNSPECIFIED = 0;
419+
420+
// Text modality.
421+
TEXT = 1;
422+
423+
// Image modality.
424+
IMAGE = 2;
425+
426+
// Audio modality.
427+
AUDIO = 3;
428+
}
429+
430+
// Media resolution for the input media.
431+
enum MediaResolution {
432+
// Media resolution has not been set.
433+
MEDIA_RESOLUTION_UNSPECIFIED = 0;
434+
435+
// Media resolution set to low (64 tokens).
436+
MEDIA_RESOLUTION_LOW = 1;
437+
438+
// Media resolution set to medium (256 tokens).
439+
MEDIA_RESOLUTION_MEDIUM = 2;
440+
441+
// Media resolution set to high (zoomed reframing with 256 tokens).
442+
MEDIA_RESOLUTION_HIGH = 3;
318443
}
319444

320445
// Optional. Controls the randomness of predictions.
@@ -411,6 +536,27 @@ message GenerationConfig {
411536
optional RoutingConfig routing_config = 17
412537
[(google.api.field_behavior) = OPTIONAL];
413538

539+
// Optional. If enabled, audio timestamps will be included in the request to
540+
// the model. This can be useful for synchronizing audio with other modalities
541+
// in the response.
542+
optional bool audio_timestamp = 20 [(google.api.field_behavior) = OPTIONAL];
543+
544+
// Optional. The modalities of the response. The model will generate a
545+
// response that includes all the specified modalities. For example, if this
546+
// is set to `[TEXT, IMAGE]`, the response will include both text and an
547+
// image.
548+
repeated Modality response_modalities = 21
549+
[(google.api.field_behavior) = OPTIONAL];
550+
551+
// Optional. The token resolution at which input media content is sampled.
552+
// This is used to control the trade-off between the quality of the response
553+
// and the number of tokens used to represent the media. A higher resolution
554+
// allows the model to perceive more detail, which can lead to a more nuanced
555+
// response, but it will also use more tokens. This does not affect the
556+
// image dimensions sent to the model.
557+
optional MediaResolution media_resolution = 22
558+
[(google.api.field_behavior) = OPTIONAL];
559+
414560
// Optional. The speech generation config.
415561
optional SpeechConfig speech_config = 23
416562
[(google.api.field_behavior) = OPTIONAL];

0 commit comments

Comments
 (0)