@@ -4,6 +4,7 @@ SCRIPT_DIR=$(dirname "$0")
4
4
# Directory holding the sample media, resolved from SCRIPT_DIR (the directory
# this script was invoked from). Quoted so a checkout path containing spaces
# is passed to realpath as a single argument.
MEDIA_DIR=$(realpath "${SCRIPT_DIR}/../../third_party")

# Individual sample files referenced by the snippets below.
IMG_PATH="${MEDIA_DIR}/organ.jpg"
IMG_PATH2="${MEDIA_DIR}/Cajun_instruments.jpg"
AUDIO_PATH="${MEDIA_DIR}/sample.mp3"
VIDEO_PATH="${MEDIA_DIR}/Big_Buck_Bunny.mp4"
PDF_PATH="${MEDIA_DIR}/test.pdf"
@@ -38,43 +39,136 @@ curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:s
38
39
39
40
echo "[START text_gen_multimodal_one_image_prompt]"
# [START text_gen_multimodal_one_image_prompt]
# Stage the base64-encoded image and the JSON request body in temporary files
# so that curl reads the payload from disk (-d @file).
TEMP_B64=$(mktemp)
TEMP_JSON=$(mktemp)
# Register a single EXIT handler for both files: a later `trap ... EXIT`
# replaces the previous handler instead of adding to it.
trap 'rm -f "$TEMP_B64" "$TEMP_JSON"' EXIT

# NOTE(review): B64FLAGS is expected to be defined elsewhere in this script;
# it is left unquoted on purpose so it may expand to zero or more options.
base64 $B64FLAGS "$IMG_PATH" > "$TEMP_B64"

# Build the request body; the unquoted EOF delimiter lets $(cat ...) expand so
# the encoded image is inlined into the "data" field.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
        {"text": "Tell me about this instrument"},
        {
          "inline_data": {
            "mime_type":"image/jpeg",
            "data": "$(cat "$TEMP_B64")"
          }
        }
    ]
  }]
}
EOF

# curl's own diagnostics on stderr are discarded; the response goes to stdout.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null
# [END text_gen_multimodal_one_image_prompt]
58
72
59
73
echo "[START text_gen_multimodal_one_image_prompt_streaming]"
# [START text_gen_multimodal_one_image_prompt_streaming]
# Rebuild the request body in TEMP_JSON from the base64 image held in
# TEMP_B64. Both variables must already be set by the one-image sample that
# precedes this one.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
        {"text": "Tell me about this instrument"},
        {
          "inline_data": {
            "mime_type":"image/jpeg",
            "data": "$(cat "$TEMP_B64")"
          }
        }
    ]
  }]
}
EOF

# Same payload as the non-streaming sample, posted to streamGenerateContent
# with alt=sse. curl's stderr is discarded.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:streamGenerateContent?alt=sse&key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null
# [END text_gen_multimodal_one_image_prompt_streaming]
96
+
97
echo "[START text_gen_multimodal_two_image_prompt]"
# [START text_gen_multimodal_two_image_prompt]
# Base64 encode both images into temporary files.
TEMP_B64_1=$(mktemp)
TEMP_B64_2=$(mktemp)
# Setting an EXIT trap replaces any handler installed earlier in the script,
# so list every temporary file here (including TEMP_B64 and TEMP_JSON from the
# one-image sample) to keep them all cleaned up. `rm -f` ignores missing files.
trap 'rm -f "$TEMP_B64" "$TEMP_JSON" "$TEMP_B64_1" "$TEMP_B64_2"' EXIT
# NOTE(review): B64FLAGS is expected to be defined elsewhere in this script;
# it is left unquoted on purpose so it may expand to zero or more options.
base64 $B64FLAGS "$IMG_PATH" > "$TEMP_B64_1"
base64 $B64FLAGS "$IMG_PATH2" > "$TEMP_B64_2"

# Create the JSON payload using the base64 data from both images.
# TEMP_JSON must already be set by the one-image sample.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
      {
        "inline_data": {
          "mime_type": "image/jpeg",
          "data": "$(cat "$TEMP_B64_1")"
        }
      },
      {
        "inline_data": {
          "mime_type": "image/jpeg",
          "data": "$(cat "$TEMP_B64_2")"
        }
      },
      {
        "text": "Generate a list of all the objects contained in both images."
      }
    ]
  }]
}
EOF

# Make the API request using the JSON file; the response body is saved to
# response.json and curl's stderr is discarded.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null > response.json

# Display the response
cat response.json
# [END text_gen_multimodal_two_image_prompt]
140
+
141
echo "[START text_gen_multimodal_one_image_bounding_box_prompt]"
# [START text_gen_multimodal_one_image_bounding_box_prompt]
# Re-use TEMP_B64_2 (from the previous two-image prompt) and TEMP_JSON; both
# must already be set when this sample runs.

# Create the JSON payload for bounding box detection.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
      {
        "inline_data": {
          "mime_type": "image/jpeg",
          "data": "$(cat "$TEMP_B64_2")"
        }
      },
      {
        "text": "Generate bounding boxes for each of the objects in this image in [y_min, x_min, y_max, x_max] format."
      }
    ]
  }]
}
EOF

# Make the API request using the JSON file. This sample targets the
# gemini-1.5-pro model; the response body is saved to response.json and
# curl's stderr is discarded.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null > response.json

cat response.json
# [END text_gen_multimodal_one_image_bounding_box_prompt]
78
172
79
173
echo " [START text_gen_multimodal_audio]"
80
174
# [START text_gen_multimodal_audio]
@@ -184,7 +278,7 @@ DISPLAY_NAME=VIDEO
184
278
# Initial resumable request defining metadata.
185
279
# The upload URL is in the response headers; dump them to a file.
186
280
curl " ${BASE_URL} /upload/v1beta/files?key=${GOOGLE_API_KEY} " \
187
- -D upload-header.tmp \
281
+ -D " ${tmp_header_file} " \
188
282
-H " X-Goog-Upload-Protocol: resumable" \
189
283
-H " X-Goog-Upload-Command: start" \
190
284
-H " X-Goog-Upload-Header-Content-Length: ${NUM_BYTES} " \
@@ -226,7 +320,7 @@ curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:g
226
320
-d ' {
227
321
"contents": [{
228
322
"parts":[
229
- {"text": "Please describe this file ."},
323
+ {"text": "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions ."},
230
324
{"file_data":{"mime_type": "video/mp4", "file_uri": ' $file_uri ' }}]
231
325
}]
232
326
}' 2> /dev/null > response.json
0 commit comments