diff --git a/.github/workflows/spelling.yaml b/.github/workflows/spelling.yaml
index ec0f2d77..23af653c 100644
--- a/.github/workflows/spelling.yaml
+++ b/.github/workflows/spelling.yaml
@@ -59,19 +59,19 @@ name: Check Spelling
on:
push:
branches:
- - "**"
+ - "**"
tags-ignore:
- - "**"
+ - "**"
pull_request:
branches:
- - "**"
+ - "**"
types:
- - "opened"
- - "reopened"
- - "synchronize"
+ - "opened"
+ - "reopened"
+ - "synchronize"
issue_comment:
types:
- - "created"
+ - "created"
jobs:
spelling:
@@ -89,70 +89,76 @@ jobs:
# note: If you use only_check_changed_files, you do not want cancel-in-progress
cancel-in-progress: false
steps:
- - name: check-spelling
- id: spelling
- uses: check-spelling/check-spelling@main
- with:
- suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }}
- checkout: true
- check_file_names: 1
- spell_check_this: check-spelling/spell-check-this@main
- post_comment: 0
- use_magic_file: 1
- report-timing: 1
- warnings: bad-regex,binary-file,deprecated-feature,ignored-expect-variant,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,no-files-to-check,unclosed-block-ignore-begin,unclosed-block-ignore-end
- experimental_apply_changes_via_bot: 1
- dictionary_source_prefixes: '{"cspell": "https://raw.githubusercontent.com/streetsidesoftware/cspell-dicts/main/dictionaries/"}'
- extra_dictionaries: |
- cspell:aws/dict/aws.txt
- cspell:bash/src/bash-words.txt
- cspell:companies/dict/companies.txt
- cspell:css/dict/css.txt
- cspell:data-science/dict/data-science-models.txt
- cspell:data-science/dict/data-science.txt
- cspell:data-science/dict/data-science-tools.txt
- cspell:dart/src/dart.txt
- cspell:de_DE/src/German_de_DE.dic
- cspell:django/requirements.txt
- cspell:django/dict/django.txt
- cspell:docker/src/docker-words.txt
- cspell:en_GB/src/hunspell/en_GB.dic
- cspell:en_US/src/aoo-mozilla-en-dict/en_US.dic
- cspell:en_US/src/hunspell/en_US-large.dic
- cspell:filetypes/src/filetypes.txt
- cspell:flutter/src/flutter.txt
- cspell:fonts/dict/fonts.txt
- cspell:fr_FR/fr-fr.trie
- cspell:fullstack/dict/fullstack.txt
- cspell:golang/dict/go.txt
- cspell:google/dict/google.txt
- cspell:html/dict/html.txt
- cspell:it_IT/dict/it-it.trie
- cspell:java/src/java.txt
- cspell:k8s/dict/k8s.txt
- cspell:mnemonics/dict/mnemonics.txt
- cspell:monkeyc/src/monkeyc_keywords.txt
- cspell:node/dict/node.txt
- cspell:npm/dict/npm.txt
- cspell:people-names/dict/people-names.txt
- cspell:php/dict/php.txt
- cspell:python/dict/python.txt
- cspell:python/dict/python-common.txt
- cspell:shell/dict/shell-all-words.txt
- cspell:software-terms/dict/softwareTerms.txt
- cspell:software-terms/dict/webServices.txt
- cspell:sql/src/common-terms.txt
- cspell:sql/src/sql.txt
- cspell:sql/src/tsql.txt
- cspell:svelte/dict/svelte.txt
- cspell:terraform/dict/terraform.txt
- cspell:typescript/dict/typescript.txt
- check_extra_dictionaries: cspell:cryptocurrencies/dict/cryptocurrencies.txt cspell:gaming-terms/dict/gaming-terms.txt cspell:latex/dict/latex.txt cspell:public-licenses/src/additional-licenses.txt cspell:public-licenses/src/generated/public-licenses.txt cspell:scientific_terms_US/custom_scientific_US.trie
- ignore-pattern: "[^'a-záéíóúñçüA-ZÁÉÍÓÚÑÇÜ]"
- upper-pattern: "[A-ZÁÉÍÓÚÑÇÜ]"
- lower-pattern: "[a-záéíóúñçü]"
- not-lower-pattern: "[^a-záéíóúñçü]"
- not-upper-or-lower-pattern: "[^A-ZÁÉÍÓÚÑÇÜa-záéíóúñçü]"
- punctuation-pattern: "'"
- only_check_changed_files: true
- longest_word: "10"
\ No newline at end of file
+ - name: check-spelling
+ id: spelling
+ uses: check-spelling/check-spelling@main
+ with:
+ suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }}
+ checkout: true
+ check_file_names: 1
+ spell_check_this: check-spelling/spell-check-this@main
+ post_comment: 0
+ use_magic_file: 1
+ report-timing: 1
+ warnings: bad-regex,binary-file,deprecated-feature,ignored-expect-variant,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,no-files-to-check,unclosed-block-ignore-begin,unclosed-block-ignore-end
+ experimental_apply_changes_via_bot: 1
+ dictionary_source_prefixes: '{"cspell": "https://raw.githubusercontent.com/streetsidesoftware/cspell-dicts/main/dictionaries/"}'
+ extra_dictionaries: |
+ cspell:aws/dict/aws.txt
+ cspell:bash/samples/bash-words.txt
+ cspell:companies/dict/companies.txt
+ cspell:css/dict/css.txt
+ cspell:data-science/dict/data-science-models.txt
+ cspell:data-science/dict/data-science.txt
+ cspell:data-science/dict/data-science-tools.txt
+ cspell:dart/src/dart.txt
+ cspell:de_DE/src/German_de_DE.dic
+ cspell:django/requirements.txt
+ cspell:django/dict/django.txt
+ cspell:docker/src/docker-words.txt
+ cspell:en_GB/src/hunspell/en_GB.dic
+ cspell:en_US/src/aoo-mozilla-en-dict/en_US.dic
+ cspell:en_US/src/hunspell/en_US-large.dic
+ cspell:filetypes/src/filetypes.txt
+ cspell:flutter/src/flutter.txt
+ cspell:fonts/dict/fonts.txt
+ cspell:fr_FR/fr-fr.trie
+ cspell:fullstack/dict/fullstack.txt
+ cspell:golang/dict/go.txt
+ cspell:google/dict/google.txt
+ cspell:html/dict/html.txt
+ cspell:it_IT/dict/it-it.trie
+ cspell:java/src/java.txt
+ cspell:k8s/dict/k8s.txt
+ cspell:mnemonics/dict/mnemonics.txt
+ cspell:monkeyc/src/monkeyc_keywords.txt
+ cspell:node/dict/node.txt
+ cspell:npm/dict/npm.txt
+ cspell:people-names/dict/people-names.txt
+ cspell:php/dict/php.txt
+ cspell:python/dict/python.txt
+ cspell:python/dict/python-common.txt
+ cspell:shell/dict/shell-all-words.txt
+ cspell:software-terms/dict/softwareTerms.txt
+ cspell:software-terms/dict/webServices.txt
+ cspell:sql/src/common-terms.txt
+ cspell:sql/src/sql.txt
+ cspell:sql/src/tsql.txt
+ cspell:svelte/dict/svelte.txt
+ cspell:terraform/dict/terraform.txt
+ cspell:typescript/dict/typescript.txt
+ check_extra_dictionaries:
+ cspell:cryptocurrencies/dict/cryptocurrencies.txt
+ cspell:gaming-terms/dict/gaming-terms.txt
+ cspell:latex/dict/latex.txt
+ cspell:public-licenses/src/additional-licenses.txt
+ cspell:public-licenses/src/generated/public-licenses.txt
+ cspell:scientific_terms_US/custom_scientific_US.trie
+ ignore-pattern: "[^'a-záéíóúñçüA-ZÁÉÍÓÚÑÇÜ]"
+ upper-pattern: "[A-ZÁÉÍÓÚÑÇÜ]"
+ lower-pattern: "[a-záéíóúñçü]"
+ not-lower-pattern: "[^a-záéíóúñçü]"
+ not-upper-or-lower-pattern: "[^A-ZÁÉÍÓÚÑÇÜa-záéíóúñçü]"
+ punctuation-pattern: "'"
+ only_check_changed_files: true
+ longest_word: "10"
\ No newline at end of file
diff --git a/.stylelintrc.json b/.stylelintrc.json
new file mode 100644
index 00000000..17e322a8
--- /dev/null
+++ b/.stylelintrc.json
@@ -0,0 +1,3 @@
+{
+ "extends": "stylelint-config-standard"
+}
\ No newline at end of file
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index a0f6b8b4..18aa044b 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -189,6 +189,8 @@ nav:
- Overview: genai-on-vertex-ai/gemini_2_0/ui_ux_tooling/README.md
- Spatial Reasoning (SDK): genai-on-vertex-ai/gemini_2_0/ui_ux_tooling/spatial_reasoning_SDK_for_gemini2.ipynb
- Interactive Spatial Reasoning: genai-on-vertex-ai/gemini_2_0/ui_ux_tooling/spatial_reasoning_app_for_gemini2.ipynb
+ - Gemini Multimodal Live API Developer Guide:
+ - Overview: genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/README.md
- Gemini Prompting Recipes:
- Overview: genai-on-vertex-ai/gemini/prompting_recipes/README.md
- Long Context Windows:
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
index d24a7a02..02061170 100644
--- a/docs/stylesheets/extra.css
+++ b/docs/stylesheets/extra.css
@@ -1,3 +1,3 @@
-.md-main__inner.md-grid {
+.md-main-inner.md-grid {
max-width: 80rem;
}
\ No newline at end of file
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/.gitignore b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/.gitignore
new file mode 100644
index 00000000..2163aca3
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/.gitignore
@@ -0,0 +1,175 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# PyPI configuration file
+.pypirc
+
+.DS_Store
+
+_*.html
\ No newline at end of file
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/CONTRIBUTING.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/CONTRIBUTING.md
new file mode 100644
index 00000000..be3e7d9f
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/CONTRIBUTING.md
@@ -0,0 +1,42 @@
+# How to Contribute
+
+At this time, we're not actively accepting external contributions to this project. However, we welcome and encourage community involvement. In the future, we plan to open up the project for contributions through a process that will likely involve:
+
+1. Forking the repository.
+2. Creating a new branch for your changes.
+3. Submitting a pull request for review.
+
+We appreciate your interest and look forward to your future contributions!
+
+## Before you begin
+
+### Sign our Contributor License Agreement
+
+Contributions to this project must be accompanied by a
+[Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
+You (or your employer) retain the copyright to your contribution; this simply
+gives us permission to use and redistribute your contributions as part of the
+project.
+
+If you or your current employer have already signed the Google CLA (even if it
+was for a different project), you probably don't need to do it again.
+
+Visit <https://cla.developers.google.com/> to see your current agreements on file
+or to sign a new one.
+
+### Review our Community Guidelines
+
+This project follows [Google's Open Source Community
+Guidelines](https://opensource.google/conduct/).
+
+## Contribution process
+
+At this time, we're not accepting contributions to this project.
+In the future, we look forward to your patches and contributions.
+
+### Code Reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/LICENSE b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/LICENSE
new file mode 100644
index 00000000..7a4a3ea2
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/README.md
new file mode 100644
index 00000000..4b92730e
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/README.md
@@ -0,0 +1,131 @@
+# Project Pastra - A Gemini Multimodal Live API Developer Guide
+
+
+
+This repository serves as a comprehensive developer guide for [Google's Gemini Multimodal Live API](https://developers.googleblog.com/en/gemini-2-0-level-up-your-apps-with-real-time-multimodal-interactions/). Through a structured, hands-on approach, you'll learn how to build sophisticated real-time applications that can see, hear, and interact naturally using Gemini's multimodal capabilities.
+
+## What You'll Learn
+
+By following this guide, you'll be able to:
+
+- Build real-time audio chat applications with Gemini
+- Implement live video interactions through webcam and screen sharing
+- Create multimodal experiences combining audio and video
+- Deploy production-ready AI assistants
+- Choose between Development API and Vertex AI implementations
+
+The guide progresses from basic concepts to advanced implementations, culminating in a [Project Astra](https://deepmind.google/technologies/project-astra/)-inspired AI assistant that demonstrates the full potential of the Gemini Multimodal Live API.
+
+## Key Concepts Covered
+
+- **Real-time Communication:**
+
+ - WebSocket-based streaming
+ - Bidirectional audio chat
+ - Live video processing
+ - Turn-taking and interruption handling
+
+- **Audio Processing:**
+
+ - Microphone input capture
+ - Audio chunking and streaming
+ - Voice Activity Detection (VAD)
+ - Real-time audio playback
+
+- **Video Integration:**
+
+ - Webcam and screen capture
+ - Frame processing and encoding
+ - Simultaneous audio-video streaming
+ - Efficient media handling
+
+- **Production Features:**
+ - Function calling capabilities
+ - System instructions
+ - Mobile-first UI design
+ - Cloud deployment
+ - Enterprise security
+
+## Guide Structure
+
+### [Part 1](part_1_intro): Introduction to Gemini's Multimodal Live API
+
+Basic concepts and SDK usage:
+
+- SDK setup and authentication
+- Text and audio interactions
+- Real-time audio chat implementation
+
+### [Part 2](part_2_dev_api): WebSocket Development with [Gemini Developer API](https://ai.google.dev/api/multimodal-live)
+
+Direct WebSocket implementation, building towards Project Pastra - a production-ready multimodal AI assistant inspired by Project Astra:
+
+- Low-level WebSocket communication
+- Audio and video streaming
+- Function calling and system instructions
+- Mobile-first deployment
+
+### [Part 3](part_3_vertex_api): WebSocket Development with [Vertex AI API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live)
+
+Enterprise-grade implementation using Vertex AI, mirroring Part 2's journey with production-focused architecture:
+
+- Proxy-based authentication
+- Service account integration
+- Cloud deployment architecture
+- Enterprise security considerations
+
+## Feature Implementation Guide
+
+Below is a comprehensive overview of where each feature is implemented across the Development API and Vertex AI versions:
+
+| Feature | Part 1 - Intro Chapter | Part 2 - Dev API Chapter | Part 3 - Vertex AI Chapter |
+| ------------------------------- | ------------------------------------ | -------------------------------------- | ------------------------------------------ |
+| SDK setup and authentication | [Chapter 1](part_1_intro/chapter_01) | - | - |
+| Text and audio interactions | [Chapter 1](part_1_intro/chapter_01) | - | - |
+| Real-time Audio Chat | [Chapter 2](part_1_intro/chapter_02) | [Chapter 5](part_2_dev_api/chapter_05) | [Chapter 9](part_3_vertex_api/chapter_09) |
+| Multimodal (Audio + Video) | - | [Chapter 6](part_2_dev_api/chapter_06) | [Chapter 10](part_3_vertex_api/chapter_10) |
+| Function Calling & Instructions | - | [Chapter 7](part_2_dev_api/chapter_07) | [Chapter 11](part_3_vertex_api/chapter_11) |
+| Production Deployment | - | [Chapter 8](part_2_dev_api/chapter_08) | [Chapter 12](part_3_vertex_api/chapter_12) |
+
+Note: The Vertex AI implementation starts directly with advanced features, skipping the basic WebSocket and text-to-speech examples.
+
+## Prerequisites
+
+- Google Cloud Project (for Vertex AI)
+- [AI Studio API key](https://aistudio.google.com/app/apikey) (for Gemini Developer API)
+- OpenWeather API key (if you want to use the weather tool)
+- Python 3.9 or higher
+- Modern web browser
+- Basic understanding of:
+ - JavaScript and HTML
+ - WebSocket communication
+ - Audio/video processing concepts
+
+## Key Differences Between Dev API and Vertex AI
+
+### Development API (Part 2)
+
+- Simple API key authentication
+- Direct WebSocket connection
+- All tools available simultaneously
+- Single-service deployment
+- Ideal for prototyping and learning
+
+### Vertex AI (Part 3)
+
+- Service account authentication
+- Proxy-based architecture
+- Single tool limitation
+- Two-service deployment (app + proxy)
+- Suited for enterprise deployment
+
+## Getting Started
+
+1. Start with Part 1 to understand basic SDK concepts
+2. Choose your implementation path:
+ - For quick prototyping: Follow Part 2 (Dev API)
+ - For enterprise deployment: Skip to Part 3 (Vertex AI)
+
+## License
+
+This project is licensed under the Apache License, Version 2.0. See the [LICENSE](LICENSE) file for details.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/audio-client.png b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/audio-client.png
new file mode 100644
index 00000000..5bfdaf21
Binary files /dev/null and b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/audio-client.png differ
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/audio-to-audio-websocket.png b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/audio-to-audio-websocket.png
new file mode 100644
index 00000000..d4e4f999
Binary files /dev/null and b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/audio-to-audio-websocket.png differ
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/hero-image.webp b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/hero-image.webp
new file mode 100644
index 00000000..cbab2c41
Binary files /dev/null and b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/hero-image.webp differ
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/mm_live_api.jpg b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/mm_live_api.jpg
new file mode 100644
index 00000000..078ad5b1
Binary files /dev/null and b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/mm_live_api.jpg differ
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/project_pastra.png b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/project_pastra.png
new file mode 100644
index 00000000..c4e58b67
Binary files /dev/null and b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/assets/project_pastra.png differ
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/README.md
new file mode 100644
index 00000000..2f9cb0b4
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/README.md
@@ -0,0 +1,40 @@
+# Part 1: Introduction to Google Gemini AI
+
+This section provides a foundational introduction to working with Google's Gemini AI model through practical examples and hands-on code.
+
+## Contents
+
+### Chapter 1: SDK Basics
+
+- Introduction to the Google Gemini AI SDK
+- Setting up the development environment
+- Basic text interactions with Gemini
+- Audio response generation examples
+- Examples using both direct API key authentication and Vertex AI authentication
+
+### Chapter 2: Multimodal Interactions
+
+- Real-time audio conversations with Gemini
+- Streaming audio input and output
+- Voice activity detection and turn-taking
+- Example implementation of an interactive voice chat
+
+## Key Features Covered
+
+- Text generation and conversations
+- Audio output generation
+- Real-time streaming interactions
+- Different authentication methods (API key and Vertex AI)
+- Multimodal capabilities (text-to-audio, audio-to-audio)
+
+## Prerequisites
+
+- Python environment
+- Google Gemini API access
+- Required packages:
+ - `google-genai`
+ - `pyaudio` (for audio examples)
+
+## Getting Started
+
+Each chapter contains Jupyter notebooks and Python scripts that demonstrate different aspects of Gemini's capabilities. Start with Chapter 1's notebooks for basic SDK usage, then move on to the more advanced multimodal examples in Chapter 2.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/audio.wav b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/audio.wav
new file mode 100644
index 00000000..a0ae066f
Binary files /dev/null and b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/audio.wav differ
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/sdk-intro-veretxai.ipynb b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/sdk-intro-veretxai.ipynb
new file mode 100644
index 00000000..d757aebd
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/sdk-intro-veretxai.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Google Gemini SDK Introduction\n",
+ "\n",
+ "This notebook demonstrates how to use the Google Gemini AI SDK to interact with the Gemini model in both text and audio modes.\n",
+ "\n",
+ "## Setup\n",
+ "First, we'll install the required package and initialize the client with our API key."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -U -q google-genai"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from google import genai\n",
+ "# client = genai.Client(vertexai=True, location='us-central1', http_options= {'api_version': 'v1beta'})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Interaction Example\n",
+ "\n",
+ "Below we'll demonstrate how to have a text conversation with Gemini. The code:\n",
+ "1. Sets up a configuration for text responses\n",
+ "2. Opens an async connection to the model\n",
+ "3. Sends a message and receives the response in chunks\n",
+ "4. Prints each chunk of the response as it arrives"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "PROJECT_ID = \"\"\n",
+ "LOCATION = \"us-central1\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "> Hello? Gemini, are you there? \n",
+ "\n",
+ "Yes\n",
+ ", I am here. What would you like to talk about?\n",
+ "\n",
+ "None\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_id = \"gemini-2.0-flash-exp\"\n",
+ "\n",
+ "client = genai.Client(vertexai=True, location=LOCATION, project=PROJECT_ID)\n",
+ "config = {\"response_modalities\": [\"TEXT\"]}\n",
+ "\n",
+ "async with client.aio.live.connect(model=model_id, config=config) as session:\n",
+ " message = \"Hello? Gemini, are you there?\"\n",
+ " print(\"> \", message, \"\\n\")\n",
+ " await session.send(message, end_of_turn=True)\n",
+ "\n",
+ " async for response in session.receive():\n",
+ " print(response.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Audio Generation Example\n",
+ "\n",
+ "Now we'll see how to generate audio responses from Gemini. This section:\n",
+ "1. Creates a wave file handler to save the audio\n",
+ "2. Configures the model for audio output\n",
+ "3. Sends a text prompt and receives audio data\n",
+ "4. Saves the audio chunks and plays them in the notebook\n",
+ "\n",
+ "Note: Make sure your browser's audio is enabled to hear the responses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import contextlib\n",
+ "import wave\n",
+ "\n",
+ "\n",
+ "@contextlib.contextmanager\n",
+ "def wave_file(filename, channels=1, rate=24000, sample_width=2):\n",
+ " with wave.open(filename, \"wb\") as wf:\n",
+ " wf.setnchannels(channels)\n",
+ " wf.setsampwidth(sample_width)\n",
+ " wf.setframerate(rate)\n",
+ " yield wf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "> Hello? Gemini are you there? \n",
+ "\n",
+ "audio/pcm\n",
+ "................................"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from IPython.display import display, Audio\n",
+ "\n",
+ "config={\n",
+ " \"generation_config\": {\"response_modalities\": [\"AUDIO\"]}}\n",
+ "\n",
+ "async with client.aio.live.connect(model=model_id, config=config) as session:\n",
+ " file_name = 'audio.wav'\n",
+ " with wave_file(file_name) as wav:\n",
+ " message = \"Hello? Gemini are you there?\"\n",
+ " print(\"> \", message, \"\\n\")\n",
+ " await session.send(message, end_of_turn=True)\n",
+ "\n",
+ " first = True\n",
+ " async for response in session.receive():\n",
+ " if response.data is not None:\n",
+ " model_turn = response.server_content.model_turn\n",
+ " if first:\n",
+ " print(model_turn.parts[0].inline_data.mime_type)\n",
+ " first = False\n",
+ " print('.', end='.')\n",
+ " wav.writeframes(response.data)\n",
+ "\n",
+ "\n",
+ "display(Audio(file_name, autoplay=True))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/sdk-intro.ipynb b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/sdk-intro.ipynb
new file mode 100644
index 00000000..6d8795f7
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_01/sdk-intro.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Google Gemini SDK Introduction\n",
+ "\n",
+ "This notebook demonstrates how to use the Google Gemini AI SDK to interact with the Gemini model in both text and audio modes.\n",
+ "\n",
+ "## Setup\n",
+ "First, we'll install the required package and initialize the client with our API key."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -U -q google-genai"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from google import genai\n",
+ "client = genai.Client(vertexai=False, http_options= {'api_version': 'v1alpha'}, api_key='')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Interaction Example\n",
+ "\n",
+ "Below we'll demonstrate how to have a text conversation with Gemini. The code:\n",
+ "1. Sets up a configuration for text responses\n",
+ "2. Opens an async connection to the model\n",
+ "3. Sends a message and receives the response in chunks\n",
+ "4. Prints each chunk of the response as it arrives"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "MODEL = \"gemini-2.0-flash-exp\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "> Hello? Gemini are you there? \n",
+ "\n",
+ "- Yes, I'\n",
+ "- m here! How can I help you today?\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "config={\n",
+ " \"generation_config\": {\"response_modalities\": [\"TEXT\"]}}\n",
+ "\n",
+ "async with client.aio.live.connect(model=MODEL, config=config) as session:\n",
+ " message = \"Hello? Gemini are you there?\"\n",
+ " print(\"> \", message, \"\\n\")\n",
+ " await session.send(message, end_of_turn=True)\n",
+ "\n",
+ " # For text responses, When the model's turn is complete it breaks out of the loop.\n",
+ " turn = session.receive()\n",
+ " async for chunk in turn:\n",
+ " if chunk.text is not None:\n",
+ " print(f'- {chunk.text}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Audio Generation Example\n",
+ "\n",
+ "Now we'll see how to generate audio responses from Gemini. This section:\n",
+ "1. Creates a wave file handler to save the audio\n",
+ "2. Configures the model for audio output\n",
+ "3. Sends a text prompt and receives audio data\n",
+ "4. Saves the audio chunks and plays them in the notebook\n",
+ "\n",
+ "Note: Make sure your browser's audio is enabled to hear the responses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import contextlib\n",
+ "import wave\n",
+ "\n",
+ "\n",
+ "@contextlib.contextmanager\n",
+ "def wave_file(filename, channels=1, rate=24000, sample_width=2):\n",
+ " with wave.open(filename, \"wb\") as wf:\n",
+ " wf.setnchannels(channels)\n",
+ " wf.setsampwidth(sample_width)\n",
+ " wf.setframerate(rate)\n",
+ " yield wf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "> Hello? Gemini are you there? \n",
+ "\n",
+ "audio/pcm;rate=24000\n",
+ ".............................."
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from IPython.display import display, Audio\n",
+ "\n",
+ "config={\n",
+ " \"generation_config\": {\"response_modalities\": [\"AUDIO\"]}}\n",
+ "\n",
+ "async with client.aio.live.connect(model=MODEL, config=config) as session:\n",
+ " file_name = 'audio.wav'\n",
+ " with wave_file(file_name) as wav:\n",
+ " message = \"Hello? Gemini are you there?\"\n",
+ " print(\"> \", message, \"\\n\")\n",
+ " await session.send(message, end_of_turn=True)\n",
+ "\n",
+ " first = True\n",
+ " async for response in session.receive():\n",
+ " if response.data is not None:\n",
+ " model_turn = response.server_content.model_turn\n",
+ " if first:\n",
+ " print(model_turn.parts[0].inline_data.mime_type)\n",
+ " first = False\n",
+ " print('.', end='.')\n",
+ " wav.writeframes(response.data)\n",
+ "\n",
+ "\n",
+ "display(Audio(file_name, autoplay=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_02/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_02/README.md
new file mode 100644
index 00000000..1d4ec18f
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_02/README.md
@@ -0,0 +1,152 @@
+# Gemini Live Audio Chat
+
+This project enables real-time, two-way audio communication with a Gemini language model. The application captures audio input from the user's microphone, sends it to the Gemini API for processing, receives the model's audio response, and plays it back through the user's speakers. This creates an interactive and conversational experience, similar to talking to a voice assistant.
+
+The core of the application lies in its ability to manage the continuous flow of audio data between the user and the model. It uses asynchronous programming to handle audio input and output concurrently, ensuring smooth and responsive interaction. The application utilizes the `pyaudio` library to interface with the user's audio hardware, capturing microphone input and playing audio output. The `google-genai` library facilitates communication with the Gemini API, sending audio data for processing and receiving the model's responses.
+
+## How it works
+
+### System Architecture
+
+
+
+The application's functionality can be broken down into several key components:
+
+### Audio Input and Output
+
+The `pyaudio` library is used to create input and output streams that interface with the user's audio hardware (see the sketch after this list).
+
+- **Input Stream:** An input stream is initialized to capture audio data from the user's microphone. The stream is configured with parameters such as format, channels, sample rate, and chunk size. The `SEND_SAMPLE_RATE` is set to 16000 Hz, which is a common sample rate for speech recognition. The `CHUNK_SIZE` determines the number of audio frames read from the microphone at a time. The `exception_on_overflow` parameter is set to `False` to prevent the stream from raising an exception if the buffer overflows.
+- **Output Stream:** An output stream is initialized to play audio data through the user's speakers. Similar to the input stream, it is configured with appropriate parameters. The `RECEIVE_SAMPLE_RATE` is set to 24000 Hz, which is suitable for high-quality audio playback.
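+
+As a minimal sketch (using the parameter values described above; the actual script may differ in names and details), the two streams could be opened like this:
+
+```python
+import pyaudio
+
+FORMAT = pyaudio.paInt16       # 16-bit PCM audio
+CHANNELS = 1                   # mono
+SEND_SAMPLE_RATE = 16000       # microphone capture rate (Hz)
+RECEIVE_SAMPLE_RATE = 24000    # playback rate for model audio (Hz)
+CHUNK_SIZE = 512               # frames per chunk (~32 ms at 16 kHz)
+
+pya = pyaudio.PyAudio()
+
+# Input stream: captures microphone audio to send to the model.
+input_stream = pya.open(format=FORMAT, channels=CHANNELS, rate=SEND_SAMPLE_RATE,
+                        input=True, frames_per_buffer=CHUNK_SIZE)
+
+# Output stream: plays back the model's audio responses.
+output_stream = pya.open(format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE,
+                         output=True)
+```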
+
+### Communication with Gemini API
+
+The `google-genai` library provides the necessary tools to connect to the Gemini API and establish a communication session (see the sketch after this list).
+
+- **Client Initialization:** A `genai.Client` is created to interact with the API. The `http_options` parameter is used to specify the API version, which is set to `'v1alpha'` in this case.
+- **Session Configuration:** A configuration object `CONFIG` is defined to customize the interaction with the model. This includes:
+ - `generation_config`: Specifies the response modality as "AUDIO" and configures the "speech_config" to "Puck".
+ - `system_instruction`: Sets a system instruction to always start the model's sentences with "mate".
+- **Live Connection:** The `client.aio.live.connect` method establishes a live connection to the Gemini model specified by `MODEL`, which is set to `"models/gemini-2.0-flash-exp"`.
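+
+A condensed sketch of this setup (the `CONFIG` structure mirrors the description above and should be treated as illustrative, not as the definitive script; the API key is a placeholder):
+
+```python
+from google import genai
+
+MODEL = "models/gemini-2.0-flash-exp"
+
+# Developer API client pinned to the v1alpha API version.
+client = genai.Client(vertexai=False, http_options={"api_version": "v1alpha"},
+                      api_key="YOUR_API_KEY")
+
+CONFIG = {
+    "generation_config": {
+        "response_modalities": ["AUDIO"],
+        "speech_config": "Puck",  # voice configuration described above
+    },
+    # Example system instruction; the exact wording in the script may differ.
+    "system_instruction": "Always start your sentences with 'mate'.",
+}
+
+async def main():
+    # Open a live, bidirectional session with the model.
+    async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
+        ...  # the listen_and_send and receive_and_play tasks run against this session
+```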
+
+### Asynchronous Audio Handling
+
+The `asyncio` library is used to manage the asynchronous operations involved in audio processing and communication.
+
+- **Audio Queue:** An `asyncio.Queue` is created to store audio data temporarily. This queue is not used in the current implementation but is defined for potential future use.
+- **Task Group:** An `asyncio.TaskGroup` is used to manage two concurrent tasks: `listen_and_send` and `receive_and_play`.
+- **`listen_and_send` Task:** This task continuously reads audio data from the input stream in chunks and sends it to the Gemini API. It checks if the model is currently speaking (`model_speaking` flag) and only sends data if the model is not speaking. The chunking is performed using the `pyaudio` library's `read()` method, which is called with a specific `CHUNK_SIZE` (number of audio frames per chunk). Here's how it's done in the code:
+
+ ```python
+ while True:
+ if not model_speaking:
+ try:
+ data = await asyncio.to_thread(input_stream.read, CHUNK_SIZE, exception_on_overflow=False)
+ # ... send data to API ...
+ except OSError as e:
+ # ... handle error ...
+ ```
+
+ In this code, `input_stream.read(CHUNK_SIZE)` reads a chunk of audio frames from the microphone's input buffer. Each chunk is then sent to the API along with the `end_of_turn=True` flag.
+
+- **`receive_and_play` Task:** This task continuously receives responses from the Gemini API and plays the audio data through the output stream. It sets the `model_speaking` flag to `True` when the model starts speaking and to `False` when the turn is complete. It then iterates through the parts of the response and writes the audio data to the output stream.
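+
+For symmetry with the `listen_and_send` snippet above, here is a simplified sketch of the `receive_and_play` task (the field names follow the description in this README; the real script may differ in details):
+
+```python
+import asyncio
+
+async def receive_and_play(session, output_stream):
+    global model_speaking
+    async for response in session.receive():
+        server_content = response.server_content
+        if server_content.model_turn is not None:
+            model_speaking = True  # pause microphone sending while the model talks
+            for part in server_content.model_turn.parts:
+                # Write the raw PCM audio from the model to the speakers without
+                # blocking the event loop.
+                await asyncio.to_thread(output_stream.write, part.inline_data.data)
+        if server_content.turn_complete:
+            model_speaking = False  # hand the turn back to the user
+```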
+
+### Audio Chunking and Real-time Interaction
+
+A crucial aspect of the application's real-time audio processing is how the continuous audio stream from the microphone is divided into smaller chunks before being sent to the Gemini API. This chunking is performed in the `listen_and_send` task using the `pyaudio` library.
+
+**Chunking Process:**
+
+The `input_stream.read(CHUNK_SIZE)` method is called repeatedly to read a fixed number of audio frames (defined by `CHUNK_SIZE`) from the microphone's buffer. Each chunk represents a small segment of the audio stream. The current implementation uses a `CHUNK_SIZE` of 512 frames.
+
+**Calculating Chunk Duration:**
+
+The duration of each audio chunk can be calculated using the following formula:
+
+`Chunk Duration (seconds) = (Number of Frames) / (Sample Rate)`
+
+In this case, with a `CHUNK_SIZE` of 512 frames and a `SEND_SAMPLE_RATE` of 16000 Hz, the chunk duration is:
+
+`Chunk Duration = 512 frames / 16000 Hz = 0.032 seconds = 32 milliseconds`
+
+Therefore, each chunk represents 32 milliseconds of audio.
+
+**Real-time Interaction Flow:**
+
+To understand how chunking enables a smooth, real-time conversation, let's trace the steps involved when you speak to the model:
+
+1. **User Speaks:** You start speaking into the microphone.
+2. **Audio Capture:** The `listen_and_send` task continuously captures audio data from the microphone.
+3. **Chunking (Fast):** Every time 512 frames (32 milliseconds of audio) are captured, a chunk is created.
+4. **Send to API (Frequent):** This small chunk is immediately sent to the Gemini API, along with `end_of_turn=True`.
+5. **API Processing (Starts Early):** The API receives the chunk and its Voice Activity Detection (VAD) starts analyzing it. Because the chunks are small and frequent, the API can begin processing the audio very quickly, even while the user is still speaking.
+6. **Model Response (Begins Quickly):** Once the API's VAD detects a pause that it interprets as the end of a user's turn (even if it's a short pause between phrases), the Gemini model starts generating a response based on the audio it has received so far.
+7. **Audio Output (Low Latency):** The response audio is sent back to the client in chunks. The `receive_and_play` task starts playing the response audio as soon as it arrives, minimizing the delay.
+
+**Impact of `CHUNK_SIZE`:**
+
+The `CHUNK_SIZE` is a configurable parameter that affects the latency and responsiveness of the system. Smaller chunks can potentially reduce latency, as they allow the API to start processing and responding sooner. However, very small chunks might increase processing overhead. Larger chunks, on the other hand, would introduce noticeable delays in the conversation, making it feel sluggish and less interactive. The choice of 512 frames strikes a good balance between low latency and manageable processing overhead for a real-time chat application.
+
+**Why `end_of_turn=True` with Each Chunk?**
+
+Each chunk is sent to the API with the `end_of_turn=True` flag. While this might seem like it would interrupt the flow of the conversation, the Gemini API uses its Voice Activity Detection (VAD) to determine the actual turn boundaries based on longer pauses in the audio stream, not solely on the `end_of_turn` flag from each chunk. This allows for a relatively smooth conversation flow despite the frequent `end_of_turn` signals.
+
+### Input/Output and Turn-Taking
+
+The application distinguishes between user input and model output through a combination of the `model_speaking` flag, the `end_of_turn=True` signal sent with each audio chunk, and the Gemini API's Voice Activity Detection (VAD).
+
+**Distinguishing Input from Output:**
+
+- **`model_speaking` Flag:** This boolean flag serves as a primary mechanism to differentiate between when the user is providing input and when the model is generating output.
+ - When `model_speaking` is `False`, the application assumes it's the user's turn to speak. The `listen_and_send` task reads audio data from the microphone and sends it to the API.
+ - When `model_speaking` is `True`, the application understands that the model is currently generating an audio response. The `listen_and_send` task pauses, preventing user input from being sent to the API while the model is "speaking." The `receive_and_play` task is active during this time, receiving and playing the model's audio output.
+
+**How Audio Chunks are Sent:**
+
+- **`end_of_turn=True` with Each Chunk:** The `listen_and_send` task sends each chunk of audio data (determined by `CHUNK_SIZE`) with `end_of_turn=True` in the message payload: `await session.send({"data": data, "mime_type": "audio/pcm"}, end_of_turn=True)`. This might seem like it would constantly interrupt the conversation flow. However, the API handles this gracefully.
+- **API-Side Buffering and VAD:** The Gemini API likely buffers the incoming audio chunks on its end. Even though each chunk is marked as the end of a turn with `end_of_turn=True`, the API's Voice Activity Detection (VAD) analyzes the buffered audio to identify longer pauses or periods of silence that more accurately represent the actual end of the user's speech. The API can group several chunks into what it considers a single user turn based on its VAD analysis, rather than strictly treating each chunk as a separate turn.
+- **Low-Latency Processing:** The API is designed for low-latency interaction. It starts processing the received audio chunks as soon as possible. Even if `end_of_turn=True` is sent with each chunk, the API can begin generating a response while still receiving more audio from the user, as long as it hasn't detected a significant enough pause to finalize the user's turn based on its VAD.
+
+**Determining End of Model Turn:**
+
+- **`turn_complete` Field:** The `receive_and_play` task continuously listens for responses from the API. Each response includes a `server_content` object, which contains a `turn_complete` field.
+ - When `turn_complete` is `True`, it signifies that the model has finished generating its response for the current turn.
+ - Upon receiving a `turn_complete: True` signal, the `receive_and_play` task sets the `model_speaking` flag to `False`. This signals that the model's turn is over, and the application is ready to accept new user input.
+
+**Turn-Taking Flow:**
+
+1. Initially, `model_speaking` is `False`, indicating it's the user's turn.
+2. The `listen_and_send` task captures audio chunks from the microphone and sends each chunk to the API with `end_of_turn=True`.
+3. The API buffers the audio and its VAD determines the actual end of the user's speech based on longer pauses, not just the `end_of_turn` signal from each chunk.
+4. The model processes the input and starts generating a response.
+5. The `receive_and_play` task receives the response, sets `model_speaking` to `True`, and plays the audio.
+6. When the model finishes, it sends `turn_complete: True`.
+7. The `receive_and_play` task sets `model_speaking` to `False`, switching back to the user's turn.
+
+In essence, although `end_of_turn=True` is sent with each audio chunk, the API's VAD plays a more significant role in determining the actual turn boundaries. The `end_of_turn=True` in this implementation may act more as a hint or nudge to the API to process the audio than as a definitive end-of-turn marker. This approach allows for a relatively smooth conversation flow despite the frequent `end_of_turn` signals, thanks to the API's buffering, VAD, and low-latency processing.
+
+### Why Always Set `end_of_turn=True`?
+
+Setting `end_of_turn=True` with each audio chunk, even when the user hasn't finished speaking, might seem counterintuitive. Here are some reasons for this design choice:
+
+1. **Simplicity and Reduced Client-Side Complexity:** Implementing robust Voice Activity Detection (VAD) on the client-side can be complex. By always setting `end_of_turn=True`, the developers might have opted for a simpler client-side implementation that offloads the more complex VAD task to the Gemini API.
+2. **Lower Latency:** Sending smaller chunks with `end_of_turn=True` might allow the API to start processing the audio sooner. However, this potential latency benefit depends heavily on how the API is designed.
+3. **Emphasis on API-Side Control:** By sending `end_of_turn=True` frequently, the client cedes more control over turn-taking to the API. The API's VAD becomes the primary mechanism for determining turn boundaries.
+
+**It's important to note:** While this approach can work, it is not necessarily the optimal or most efficient way to handle turn-taking in a voice conversation system. Ideally, you would send `end_of_turn=True` only when the user has actually finished speaking, which would typically involve implementing client-side VAD.
+
+### Main Loop
+
+The `audio_loop` function orchestrates the entire process.
+
+1. **Initialization:** It initializes variables, including the audio queue, `model_speaking` flag, and session object.
+2. **Connection and Task Creation:** It establishes a live connection to the Gemini API and creates the `listen_and_send` and `receive_and_play` tasks within a task group.
+3. **Error Handling:** It includes a `try...except` block to catch any exceptions that occur during the process and prints the traceback.
+
+### Execution
+
+The `if __name__ == "__main__":` block ensures that the `audio_loop` function is executed only when the script is run directly. The `asyncio.run` function starts the asynchronous event loop and runs the `audio_loop` function, enabling the real-time audio chat.
+
+## Limitations
+
+The current implementation does not support user interruption of the model's speech. Future implementations could support interruption by sending a specific interrupt signal to the API or by modifying the current `end_of_turn` logic to be more responsive to shorter pauses in user speech.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_02/audio-to-audio.py b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_02/audio-to-audio.py
new file mode 100755
index 00000000..bb3b3299
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_1_intro/chapter_02/audio-to-audio.py
@@ -0,0 +1,99 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import traceback
+import pyaudio
+
+from google import genai
+
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+SEND_SAMPLE_RATE = 16000
+RECEIVE_SAMPLE_RATE = 24000
+CHUNK_SIZE = 512
+
+MODEL = "models/gemini-2.0-flash-exp"
+
+client = genai.Client(http_options={'api_version': 'v1alpha'})
+
+CONFIG = {
+ "generation_config": {"response_modalities": ["AUDIO"], "speech_config": "Puck"},
+ "system_instruction": "Always start your sentence with 'mate'."
+}
+
+async def audio_loop():
+ """Orchestrates the real-time audio chat loop."""
+ model_speaking = False
+ session = None
+
+ pya = pyaudio.PyAudio()
+ mic_info = pya.get_default_input_device_info()
+
+ try:
+ async with (
+ client.aio.live.connect(model=MODEL, config=CONFIG) as session,
+ asyncio.TaskGroup() as tg,
+ ):
+ input_stream = await asyncio.to_thread(
+ pya.open,
+ format=FORMAT,
+ channels=CHANNELS,
+ rate=SEND_SAMPLE_RATE,
+ input=True,
+ input_device_index=mic_info["index"],
+ frames_per_buffer=CHUNK_SIZE,
+ )
+ output_stream = await asyncio.to_thread(
+ pya.open, format=FORMAT, channels=CHANNELS, rate=RECEIVE_SAMPLE_RATE, output=True
+ )
+
+ async def listen_and_send():
+ nonlocal model_speaking
+ while True:
+ if not model_speaking:
+ try:
+ data = await asyncio.to_thread(input_stream.read, CHUNK_SIZE, exception_on_overflow=False)
+ await session.send({"data": data, "mime_type": "audio/pcm"}, end_of_turn=True)
+ except OSError as e:
+ print(f"Audio input error: {e}")
+ await asyncio.sleep(0.1)
+ else:
+ await asyncio.sleep(0.1)
+
+ async def receive_and_play():
+ nonlocal model_speaking
+ while True:
+ async for response in session.receive():
+ server_content = response.server_content
+ if server_content and server_content.model_turn:
+ model_speaking = True
+ for part in server_content.model_turn.parts:
+ if part.inline_data:
+ await asyncio.to_thread(output_stream.write, part.inline_data.data)
+
+ if server_content and server_content.turn_complete:
+ print("Turn complete")
+ model_speaking = False
+
+ tg.create_task(listen_and_send())
+ tg.create_task(receive_and_play())
+
+ except Exception as e:
+ traceback.print_exception(None, e, e.__traceback__)
+
+if __name__ == "__main__":
+ asyncio.run(audio_loop(), debug=True)
\ No newline at end of file
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/README.md
new file mode 100644
index 00000000..b87a45d6
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/README.md
@@ -0,0 +1,91 @@
+# Part 2: WebSocket Development with Gemini API
+
+This section demonstrates how to work directly with the Gemini API using WebSockets, progressively building towards Project Pastra - a production-ready multimodal AI assistant inspired by Google DeepMind's Project Astra. Through a series of chapters, we evolve from basic implementations to a sophisticated, mobile-first application that showcases the full potential of the Gemini API.
+
+## Journey to Project Pastra
+
+Starting with fundamental WebSocket concepts, each chapter adds new capabilities, ultimately culminating in Project Pastra - our implementation of a universal AI assistant that can see, hear, and interact in real-time. Like Project Astra (Google DeepMind's research prototype), our application demonstrates how to create an AI assistant that can engage in natural, multimodal interactions while maintaining production-grade reliability.
+
+## Contents
+
+### Chapter 3: Basic WebSocket Communication
+
+- Single exchange example with the Gemini API
+- Core WebSocket setup and communication
+- Understanding the API's message formats
+- Handling the mandatory setup phase
+
+### Chapter 4: Text-to-Speech Implementation
+
+- Converting text input to audio responses
+- Real-time audio playback in the browser
+- Audio chunk management and streaming
+- WebSocket and AudioContext integration
+
+### Chapter 5: Real-time Audio Chat
+
+- Bidirectional audio communication
+- Live microphone input processing
+- Voice activity detection and turn management
+- Advanced audio streaming techniques
+
+### Chapter 6: Multimodal Interactions
+
+- Adding video capabilities (webcam and screen sharing)
+- Frame capture and processing
+- Simultaneous audio and video streaming
+- Enhanced user interface controls
+
+### Chapter 7: Advanced Features
+
+- Function calling capabilities
+- System instructions integration
+- External API integrations (weather, search)
+- Code execution functionality
+
+### Chapter 8: Project Pastra
+
+- Mobile-first UI design inspired by Project Astra
+- Cloud Run deployment setup
+- Production-grade error handling
+- Scalable architecture implementation
+
+## Key Features
+
+- Direct WebSocket communication with Gemini API
+- Real-time audio and video processing
+- Browser-based implementation
+- Mobile and desktop support
+- Production deployment guidance
+
+## Prerequisites
+
+- Basic understanding of WebSockets
+- Familiarity with JavaScript and HTML5
+- Google Gemini API access
+- Modern web browser with WebSocket support
+
+## Getting Started
+
+This guide uses a simple development server to:
+
+- Serve the HTML/JavaScript files for each chapter
+- Provide access to shared components (audio processing, media handling, etc.) used across chapters
+- Enable proper loading of JavaScript modules and assets
+- Avoid CORS issues when accessing local files
+
+1. Start the development server:
+
+ ```bash
+ python server.py
+ ```
+
+ This will serve both the chapter files and shared components at http://localhost:8000
+
+2. Navigate to the specific chapter you want to work with:
+
+ - Chapter 3: http://localhost:8000/chapter_03/
+ - Chapter 4: http://localhost:8000/chapter_04/
+   - ...and so on for the remaining chapters
+
+3. Begin with Chapter 3 to understand the fundamentals of WebSocket communication with Gemini. Each subsequent chapter builds upon previous concepts, gradually introducing more complex features and capabilities. By Chapter 8, you'll have transformed the development prototype into Project Pastra - a production-ready AI assistant that demonstrates the future of human-AI interaction.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_03/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_03/README.md
new file mode 100644
index 00000000..88454102
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_03/README.md
@@ -0,0 +1,138 @@
+# Gemini WebSocket Test - Single Exchange Example
+
+This HTML file demonstrates a **single exchange** with the Gemini language model using WebSockets, illustrating the fundamental principles of interacting with the API at a low level, **without using an SDK**. The application establishes a WebSocket connection, sends a hardcoded user message, and displays the model's response in the browser.
+
+**This example is primarily for educational purposes**, showcasing how to use the browser's built-in WebSocket API to communicate with the Gemini API directly. It is not intended to be a full-fledged chat application but rather a simplified demonstration of the underlying communication mechanism.
+
+## How it works
+
+The application's functionality can be broken down into several key components:
+
+### 1. Establishing a WebSocket Connection
+
+- **API Endpoint:** The application connects to the Gemini API using a specific WebSocket endpoint URL:
+ ```
+ wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${apiKey}
+ ```
+ This URL includes the API host, the service path, and an API key for authentication. Replace `${apiKey}` with your actual API key.
+- **WebSocket Object:** A new `WebSocket` object is created in JavaScript, initiating the connection:
+ ```javascript
+ const ws = new WebSocket(endpoint);
+ ```
+- **Event Handlers:** Event handlers are defined to manage the connection's lifecycle and handle incoming messages:
+ - `onopen`: Triggered when the connection is successfully opened.
+ - `onmessage`: Triggered when a message is received from the server.
+ - `onerror`: Triggered if an error occurs during the connection.
+ - `onclose`: Triggered when the connection is closed.
+
+### 2. Sending a Setup Message (Mandatory First Step)
+
+- **API Requirement:** The Gemini API requires a setup message to be sent as the **very first message** after the WebSocket connection is established. This is crucial for configuring the session.
+- **`onopen` Handler:** The `onopen` event handler, which is triggered when the connection is open, is responsible for sending this setup message.
+- **Setup Message Structure:** The setup message is a JSON object that conforms to the `BidiGenerateContentSetup` format as defined in the API documentation:
+ ```javascript
+ const setupMessage = {
+ setup: {
+ model: "models/gemini-2.0-flash-exp",
+ generation_config: {
+ response_modalities: ["text"],
+ },
+ },
+ };
+ ```
+ - `model`: Specifies the Gemini model to use (`"models/gemini-2.0-flash-exp"` in this case).
+ - `generation_config`: Configures the generation parameters, such as the `response_modalities` (set to `"text"` for text-based output). You can also specify other parameters like `temperature`, `top_p`, `top_k`, etc., within `generation_config` as needed.
+- **Sending the Message:** The setup message is stringified and sent to the server using `ws.send()`:
+ ```javascript
+ ws.send(JSON.stringify(setupMessage));
+ ```
+
+### 3. Receiving and Processing Messages
+
+- **`onmessage` Handler:** The `onmessage` event handler receives messages from the server.
+- **Data Handling:** The code handles potential `Blob` data using `new Response(event.data).text()`, but in this text-only example, it directly parses the message as JSON.
+- **Response Parsing:** The received message is parsed as a JSON object using `JSON.parse()`.
+- **Message Types:** The code specifically checks for a `BidiGenerateContentSetupComplete` message type, indicated by the `setupComplete` field in the response.
+
+### 4. Confirming Setup Completion Before Proceeding
+
+- **`setupComplete` Check:** The code includes a conditional check to ensure that a `setupComplete` message is received before sending any user content:
+ ```javascript
+ if (response.setupComplete) {
+ // ... Send user message ...
+ }
+ ```
+- **Why This Is Important:** This check is essential because the API will not process user content messages until the setup is complete. Sending content before receiving confirmation that the setup is complete will likely result in an error or unexpected behavior. The API might close the connection if messages other than the initial setup message are sent before the setup is completed.
+
+### 5. Sending a Hardcoded User Message
+
+- **Triggered by `setupComplete`:** Only after the `setupComplete` message is received and processed does the application send a user message to the model.
+- **User Message Structure:** The user message is a JSON object conforming to the `BidiGenerateContentClientContent` format:
+ ```javascript
+ const contentMessage = {
+ client_content: {
+ turns: [
+ {
+ role: "user",
+ parts: [{ text: "Hello! Are you there?" }],
+ },
+ ],
+ turn_complete: true,
+ },
+ };
+ ```
+ - `client_content`: Contains the conversation content.
+ - `turns`: An array representing the conversation turns.
+ - `role`: Indicates the role of the speaker ("user" in this case).
+ - `parts`: An array of content parts (in this case, a single text part).
+ - `text`: The actual user message (hardcoded to "Hello! Are you there?").
+ - `turn_complete`: Set to `true` to signal the end of the user's turn.
+- **Sending the Message:** The content message is stringified and sent to the server using `ws.send()`.
+
+### 6. Displaying the Model's Response
+
+- **`serverContent` Handling:** When a `serverContent` message is received (which contains the model's response), the application extracts the response text.
+- **Response Extraction:** The model's response is accessed using `response.serverContent.modelTurn.parts[0]?.text`.
+- **Displaying the Response:** The `logMessage()` function displays the model's response in the `output` div on the HTML page.
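+
+Putting sections 3 to 6 together, a minimal sketch of the `onmessage` handler could look like the following. The `sendContentMessage()` helper is a hypothetical name for code that sends the `BidiGenerateContentClientContent` message shown above:
+
+```javascript
+ws.onmessage = async (event) => {
+  // Messages may arrive as Blob or text; normalize to text before parsing.
+  const text =
+    event.data instanceof Blob ? await new Response(event.data).text() : event.data;
+  const response = JSON.parse(text);
+
+  // Setup confirmation: only now is it safe to send user content.
+  if (response.setupComplete) {
+    sendContentMessage(); // hypothetical helper sending the client_content message
+    return;
+  }
+
+  // Model response: extract the text part and display it.
+  const modelText = response.serverContent?.modelTurn?.parts?.[0]?.text;
+  if (modelText) {
+    logMessage("Gemini: " + modelText);
+  }
+};
+```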
+
+### 7. Error Handling and Connection Closure
+
+- **`onerror` Handler:** The `onerror` event handler logs any WebSocket errors to the console and displays an error message on the page.
+- **`onclose` Handler:** The `onclose` event handler logs information about the connection closure, including the reason and status code.
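+
+A minimal sketch of these two handlers, consistent with the behavior described above (the exact log text is illustrative):
+
+```javascript
+ws.onerror = (error) => {
+  console.error("WebSocket error:", error);
+  logMessage("WebSocket error occurred");
+};
+
+ws.onclose = (event) => {
+  console.log(`Connection closed: code=${event.code}, reason=${event.reason}`);
+  logMessage("Connection closed");
+};
+```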
+
+### 8. Logging Messages
+
+- **`logMessage()` Function:** This utility function creates a new paragraph element (`<p>`) and appends it to the `output` div, displaying the provided message on the page.
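+
+A minimal sketch of such a helper, assuming the page contains an element with the ID `output`:
+
+```javascript
+function logMessage(message) {
+  // Create a paragraph element and append it to the output area.
+  const p = document.createElement("p");
+  p.textContent = message;
+  document.getElementById("output").appendChild(p);
+}
+```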
+
+## Educational Purpose
+
+This example focuses on demonstrating the **low-level interaction with the Gemini API using WebSockets**. It highlights the importance of the **setup phase** and demonstrates how to:
+
+1. **Establish a raw WebSocket connection** without relying on an SDK.
+2. **Send a properly formatted `BidiGenerateContentSetup` message** as the first message to configure the session.
+3. **Wait for and verify the `setupComplete` message** before sending any user content.
+4. **Send properly formatted `BidiGenerateContentClientContent` messages** containing user input.
+5. **Parse and interpret the JSON responses** from the API, including `BidiGenerateContentSetupComplete` and `BidiGenerateContentServerContent` messages.
+6. **Handle basic events** like `onopen`, `onmessage`, `onerror`, and `onclose`.
+
+By examining this code, you can gain a deeper understanding of the underlying communication protocol and message formats used by the Gemini API, particularly the **mandatory setup phase**. This knowledge can be valuable for debugging, troubleshooting, or building custom integrations that require more control than an SDK might offer.
+
+This example provides a solid foundation for understanding the basic principles involved in interacting with the Gemini API at a low level using WebSockets, especially the crucial setup process.
+
+**Note:** This is a simplified example for educational purposes. A real-world chat application would involve more complex features like:
+
+- Dynamic user input (see Chapter 4).
+- Handling multiple conversation turns.
+- Maintaining conversation history.
+- Potentially integrating audio or video (see Chapter 5 & 6).
+
+**Security Best Practices:**
+
+For production applications, **never** expose your API key directly in client-side code. Instead, use a secure backend server to handle authentication and proxy requests to the API. This protects your API key from unauthorized access.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_03/index.html b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_03/index.html
new file mode 100644
index 00000000..16c19623
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_03/index.html
@@ -0,0 +1,141 @@
+
+
+
+
+
+ Gemini WebSocket Test
+
+
+
+
+
+Gemini WebSocket Test
+
+ This is a simple demonstration of WebSocket communication with the
+ Gemini API, showing a single exchange between user and model. It
+ illustrates the fundamental principles of interacting with the API at a
+ low level, without using an SDK.
+
+
+
+
+
+
+
+
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_04/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_04/README.md
new file mode 100644
index 00000000..73c71cc6
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_04/README.md
@@ -0,0 +1,137 @@
+# Gemini Text-to-Speech with WebSockets
+
+This HTML file demonstrates a text-to-speech application using the Gemini API and WebSockets. It allows you to type a text message into an input field, send it to the Gemini model, and receive an audio response that is played back in the browser. The application uses the browser's built-in `WebSocket` and `AudioContext` APIs to handle real-time communication and audio playback.
+
+This example focuses on demonstrating:
+
+1. **Low-level interaction with the Gemini API using WebSockets.**
+2. **Handling audio responses** from the API and playing them back in the browser.
+3. **Managing user input** and displaying messages.
+4. **Implementing audio chunk queuing and playback** with `AudioContext`.
+
+## How it works
+
+The application's functionality can be broken down into several key components:
+
+### 1. Establishing a WebSocket Connection
+
+- **API Endpoint:** The application connects to the Gemini API using a specific WebSocket endpoint URL:
+ ```
+ wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${apiKey}
+ ```
+ This URL includes the API host, the service path, and an API key for authentication. Remember to replace `${apiKey}` with your actual API key.
+- **WebSocket Object:** A new `WebSocket` object is created in JavaScript, initiating the connection:
+ ```javascript
+ const ws = new WebSocket(endpoint);
+ ```
+- **Event Handlers:** Event handlers are defined to manage the connection's lifecycle and handle incoming messages:
+ - `onopen`: Triggered when the connection is successfully opened.
+ - `onmessage`: Triggered when a message is received from the server.
+ - `onerror`: Triggered if an error occurs during the connection.
+ - `onclose`: Triggered when the connection is closed.
+
+### 2. Sending a Setup Message
+
+- **`onopen` Handler:** When the `onopen` event is triggered, the application sends a setup message to the API.
+- **Setup Message Structure:** The setup message is a JSON object that configures the interaction:
+ ```javascript
+ const setupMessage = {
+ setup: {
+ model: "models/gemini-2.0-flash-exp",
+ generation_config: {
+ response_modalities: ["AUDIO"],
+ },
+ },
+ };
+ ```
+ - `model`: Specifies the Gemini model to use (`"models/gemini-2.0-flash-exp"` in this case).
+ - `generation_config`: Configures the generation parameters. Here, `response_modalities` is set to `["AUDIO"]` to request audio output.
+- **Sending the Message:** The setup message is stringified and sent to the server using `ws.send()`.
+- **Input Disabled:** Initially, the user input field and send button are disabled. They are only enabled after the setup is complete.
+
+### 3. Receiving and Processing Messages
+
+- **`onmessage` Handler:** The `onmessage` event handler receives messages from the server.
+- **Handling Different Response Types:** The code handles either `Blob` or `JSON` data. It converts `Blob` data to text and parses the text as JSON.
+- **Response Parsing:** The received message is parsed as a JSON object using `JSON.parse()`.
+- **Message Types:** The code checks for two types of messages:
+ - **`setupComplete`:** Indicates that the setup process is finished.
+ - **`serverContent`:** Contains the model's response, which in this case will be audio data.
+
+### 4. Sending User Messages
+
+- **Enabling Input:** When a `setupComplete` message is received, the application enables the user input field and the send button.
+- **`sendUserMessage()` Function:** This function is called when the user clicks the "Send" button or presses Enter in the input field.
+- **User Message Structure:** The user message is a JSON object:
+ ```javascript
+ const contentMessage = {
+ client_content: {
+ turns: [
+ {
+ role: "user",
+ parts: [{ text: message }],
+ },
+ ],
+ turn_complete: true,
+ },
+ };
+ ```
+ - `client_content`: Contains the conversation content.
+ - `turns`: An array representing the conversation turns.
+ - `role`: Indicates the role of the speaker ("user" in this case).
+ - `parts`: An array of content parts (in this case, a single text part containing the user's message).
+ - `turn_complete`: Set to `true` to signal the end of the user's turn.
+- **Sending the Message:** The content message is stringified and sent to the server using `ws.send()`.
+- **Clearing Input:** The input field is cleared after the message is sent.
+
+### 5. Handling Audio Responses
+
+- **`serverContent` with Audio:** When a `serverContent` message containing audio data is received, the application extracts the base64-encoded audio data.
+- **`inlineData`:** The audio data is found in `response.serverContent.modelTurn.parts[0].inlineData.data`.
+- **`playAudioChunk()`:** This function is called to handle the audio chunk.
+- **Audio Queue:** Audio is pushed into an `audioQueue` array for processing.
+- **Audio Playback Management:** `isPlayingAudio` flag ensures that chunks are played sequentially, one after the other.
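+
+A condensed sketch of this flow, following the description above. The `handleServerContent()` wrapper is a hypothetical name; the queue and flag mirror the ones described:
+
+```javascript
+let audioQueue = [];
+let isPlayingAudio = false;
+
+function handleServerContent(response) {
+  // Extract the base64-encoded audio data, if present.
+  const audioData = response.serverContent?.modelTurn?.parts?.[0]?.inlineData?.data;
+  if (audioData) {
+    playAudioChunk(audioData);
+  }
+}
+
+function playAudioChunk(base64Audio) {
+  // Queue the chunk; playback proceeds sequentially, one chunk at a time.
+  audioQueue.push(base64Audio);
+  if (!isPlayingAudio) {
+    isPlayingAudio = true;
+    processAudioQueue(); // described in the next section
+  }
+}
+```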
+
+### 6. Audio Playback with `AudioContext`
+
+- **`ensureAudioInitialized()`:** This function initializes the `AudioContext` when the first audio chunk is received. This is done lazily to comply with browser autoplay policies. It sets a sample rate of 24000.
+ - **Lazy Initialization:** The `AudioContext` is only created when the first audio chunk is received. This is because some browsers restrict audio playback unless it's initiated by a user action.
+ - **Sample Rate:** The sample rate is set to 24000 Hz, which is a common sample rate for speech audio.
+- **`playAudioChunk()`:** This function adds an audio chunk to a queue (`audioQueue`) and initiates audio playback if it's not already playing.
+- **`processAudioQueue()`:** This function is responsible for processing and playing audio chunks from the queue.
+ - **Chunk Handling:** It retrieves an audio chunk from the queue.
+ - **Base64 Decoding:** The base64-encoded audio chunk is decoded to an `ArrayBuffer` using `base64ToArrayBuffer()`.
+ - **PCM to Float32 Conversion:** The raw PCM16LE (16-bit little-endian Pulse Code Modulation) audio data is converted to Float32 format using `convertPCM16LEToFloat32()`. This is necessary because `AudioContext` works with floating-point audio data.
+ - **Creating an `AudioBuffer`:** An `AudioBuffer` is created with a single channel, the appropriate length, and a sample rate of 24000 Hz. The Float32 audio data is then copied into the `AudioBuffer`.
+ - **Creating an `AudioBufferSourceNode`:** An `AudioBufferSourceNode` is created, which acts as a source for the audio data. The `AudioBuffer` is assigned to the source node.
+ - **Connecting to Destination:** The source node is connected to the `AudioContext`'s destination (the speakers).
+ - **Starting Playback:** `source.start(0)` starts the playback of the audio chunk immediately.
+ - **`onended` Event:** A promise is used with the `onended` event of the source node to ensure that the next chunk in the queue is only played after the current chunk has finished playing. This is crucial for maintaining the correct order and avoiding overlapping audio.
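+
+Putting these steps together, a simplified sketch of `processAudioQueue()` could look like the following. It assumes `ensureAudioInitialized()` sets a module-level `audioContext` at 24000 Hz and uses the helper functions described in the next section; a real implementation may schedule buffers slightly differently:
+
+```javascript
+async function processAudioQueue() {
+  await ensureAudioInitialized(); // lazily creates the 24000 Hz AudioContext
+
+  while (audioQueue.length > 0) {
+    const base64Chunk = audioQueue.shift();
+
+    // Decode base64 and convert the PCM16LE samples to Float32.
+    const arrayBuffer = base64ToArrayBuffer(base64Chunk);
+    const float32Data = convertPCM16LEToFloat32(arrayBuffer);
+
+    // Copy the samples into a single-channel AudioBuffer at 24 kHz.
+    const audioBuffer = audioContext.createBuffer(1, float32Data.length, 24000);
+    audioBuffer.copyToChannel(float32Data, 0);
+
+    // Play the chunk and wait for it to finish before starting the next one.
+    const source = audioContext.createBufferSource();
+    source.buffer = audioBuffer;
+    source.connect(audioContext.destination);
+    await new Promise((resolve) => {
+      source.onended = resolve;
+      source.start(0);
+    });
+  }
+
+  isPlayingAudio = false;
+}
+```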
+
+### 7. Helper Functions
+
+- **`base64ToArrayBuffer()`:** Converts a base64-encoded string to an `ArrayBuffer`.
+- **`convertPCM16LEToFloat32()`:** Converts PCM16LE audio data to Float32 format.
+- **`logMessage()`:** Appends a message to the `output` div on the HTML page.
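+
+Sketches of the two conversion helpers, matching the behavior described above (a real implementation may handle endianness and edge cases more carefully):
+
+```javascript
+function base64ToArrayBuffer(base64) {
+  // Decode the base64 string into raw bytes.
+  const binary = atob(base64);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) {
+    bytes[i] = binary.charCodeAt(i);
+  }
+  return bytes.buffer;
+}
+
+function convertPCM16LEToFloat32(arrayBuffer) {
+  // Interpret the buffer as 16-bit PCM and normalize to the range [-1, 1].
+  const int16 = new Int16Array(arrayBuffer);
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) {
+    float32[i] = int16[i] / 32768;
+  }
+  return float32;
+}
+```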
+
+### 8. Error Handling and Connection Closure
+
+- **`onerror` Handler:** Logs WebSocket errors to the console and displays an error message on the page.
+- **`onclose` Handler:** Logs information about the connection closure.
+
+## Summary
+
+This example demonstrates a basic text-to-speech application using the Gemini API with WebSockets. It showcases:
+
+- Establishing a WebSocket connection and sending a setup message.
+- Handling user input and sending text messages to the API.
+- Receiving audio responses in base64-encoded chunks.
+- Decoding and converting audio data to a format suitable for playback.
+- Using `AudioContext` to play the audio in the browser sequentially, one chunk after the other.
+- Implementing basic error handling and connection closure.
+
+This example provides a starting point for building more sophisticated applications that can generate audio responses from the Gemini model and play them back in real time, all within the browser environment using low-level WebSockets and `AudioContext` for audio management. The sample rate is set to 24000 Hz to match the API's output sample rate, ensuring correct playback speed and pitch.
+
+**Security Best Practices:**
+
+For production applications, **never** expose your API key directly in client-side code. Instead, use a secure backend server to handle authentication and proxy requests to the API. This protects your API key from unauthorized access.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_04/index.html b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_04/index.html
new file mode 100644
index 00000000..cfafeaf2
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_04/index.html
@@ -0,0 +1,324 @@
+
+
+
+
+
+ Gemini Text-to-Speech WebSocket Test
+
+
+
+
+
+Gemini Text-to-Speech with WebSockets
+
+ This application demonstrates real-time text-to-speech using the Gemini
+ API. Type a message and receive an audio response that plays
+ automatically in your browser. The app uses WebSockets for communication
+ and AudioContext for handling audio playback.
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_05/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_05/README.md
new file mode 100644
index 00000000..630af171
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_05/README.md
@@ -0,0 +1,362 @@
+# Chapter 5: Gemini Live Audio Chat - Real-time Audio-to-Audio with WebSockets
+
+This chapter presents a real-time audio-to-audio chat application that interacts with the Gemini Multimodal Live API using WebSockets and the Web Audio API. It demonstrates a browser-based implementation that captures live microphone input, sends it to the Gemini API, and plays back the model's audio response in real time.
+
+**This chapter builds upon the concepts introduced in previous chapters but introduces significantly more complexity** due to the use of raw WebSockets for bidirectional communication and advanced audio processing techniques for handling live audio streams.
+
+**How This Chapter Differs from Previous Chapters:**
+
+- **Chapter 2 (Live Audio Chat with Gemini):** Used the Python SDK to simplify audio streaming, but did not run in the browser. It handled audio-to-audio with the assistance of the SDK's higher-level abstractions. **Importantly, Chapter 2 used a `model_speaking` flag on the client-side to prevent the model's output from being treated as input.** This chapter achieves a similar outcome through a different mechanism, relying on the API's turn management.
+- **Chapter 3 (Low-Level WebSocket Interaction - Single Exchange Example):** Introduced low-level WebSocket interaction, but only for sending a single text query to the model.
+- **Chapter 4 (Text-to-Speech with WebSockets):** Focused on text-to-speech, sending text to the API and playing back the received audio. It introduced basic audio handling but did not involve live microphone input or complex audio stream management.
+
+**Chapter 5, in contrast, combines the real-time nature of Chapter 2 with the low-level WebSocket approach of Chapters 3 and 4 but implements a full audio-to-audio chat entirely within the browser.** This requires handling:
+
+- **Live Microphone Input:** Capturing and processing a continuous stream of audio data from the user's microphone.
+- **Bidirectional Audio Streaming:** Sending audio chunks to the API while simultaneously receiving and playing back audio responses in real time.
+- **Advanced Audio Processing:** Converting between audio formats, managing audio buffers, and ensuring smooth playback using the Web Audio API.
+- **Complex State Management:** Handling interruptions, turn-taking, and potential errors in a real-time audio stream.
+
+**Why the Increased Complexity?**
+
+The jump in complexity comes from the need to manage real-time, bidirectional audio streams directly within the browser using low-level APIs. This involves:
+
+- **No SDK Abstraction:** We're working directly with WebSockets and handling the raw message formats defined by the Gemini API, including setup and control messages.
+- **Manual Audio Handling:** We must manually capture, chunk, encode, decode, process, and play audio data, without the convenience of an SDK's built-in methods.
+- **Real-time Constraints:** We need to ensure that audio is processed and played back with minimal latency to maintain a natural conversational flow.
+- **Asynchronous Operations:** We rely heavily on asynchronous JavaScript and Promises to manage the non-blocking nature of WebSockets and audio processing.
+
+## Project Structure
+
+This chapter's application consists of the following files:
+
+- **`index.html`:** The main HTML file that sets up the user interface (a microphone button and an output area for messages) and includes the core JavaScript logic for WebSocket communication and overall application flow.
+- **`audio-recorder.js`:** Contains the `AudioRecorder` class, which handles capturing audio from the microphone, converting it to the required format, and emitting chunks of audio data using an `EventEmitter3` interface.
+- **`audio-streamer.js`:** Contains the `AudioStreamer` class, which manages audio playback using the Web Audio API. It handles queuing, buffering, and playing audio chunks received from the API, ensuring smooth and continuous playback.
+- **`audio-recording-worklet.js`:** Defines an `AudioWorkletProcessor` that runs in a separate thread and performs the low-level audio processing, including float32 to int16 conversion and chunking.
+- **`audioworklet-registry.js`:** A utility to help register and manage `AudioWorklet`s, preventing duplicate registration.
+- **`utils.js`:** Provides utility functions like `audioContext` (for creating an `AudioContext`) and `base64ToArrayBuffer` (for decoding base64 audio data).
+- **`style.css`:** Contains basic CSS styles for the user interface.
+
+## System Architecture
+
+
+
+## Detailed Explanation of Audio Processing
+
+The audio processing pipeline in this application is crucial for real-time performance. Let's break down the components, design choices, and address the specific questions raised:
+
+**1. Microphone Input and `AudioRecorder`:**
+
+- **`AudioRecorder` Class:** This class encapsulates the logic for capturing audio from the user's microphone using the browser's `MediaDevices` API (`navigator.mediaDevices.getUserMedia`).
+- **`AudioWorklet`:** It utilizes an `AudioWorklet` to perform audio processing in a separate thread, preventing the main thread from being blocked by computationally intensive audio operations, which is essential for maintaining a smooth user experience.
+- **`audio-recording-worklet.js`:** This file defines the `AudioProcessingWorklet` class, which extends `AudioWorkletProcessor`. It performs the following:
+
+ - **Float32 to Int16 Conversion:** Converts the raw audio data from Float32 format (used by the Web Audio API) to Int16 format (required by the Gemini API for PCM audio). The conversion involves scaling the Float32 values (ranging from -1.0 to 1.0) to the Int16 range (-32768 to 32767).
+ ```javascript
+ // convert float32 -1 to 1 to int16 -32768 to 32767
+ const int16Value = float32Array[i] * 32768;
+ ```
+ - **Chunking:** Buffers audio samples and sends them in chunks. This is where the frequency of audio transmission is determined. The `buffer` has a fixed length of **2048 samples**. When the `bufferWriteIndex` reaches the end of the buffer, the `sendAndClearBuffer` function is called. The buffer is sent via `postMessage` and then cleared, ready for new data.
+
+ ```javascript
+ // send and clear buffer every 2048 samples,
+ buffer = new Int16Array(2048);
+
+ // ...
+
+ if(this.bufferWriteIndex >= this.buffer.length) {
+ this.sendAndClearBuffer();
+ }
+
+ // ...
+
+ sendAndClearBuffer() {
+ this.port.postMessage({
+ event: "chunk",
+ data: {
+ int16arrayBuffer: this.buffer.slice(0, this.bufferWriteIndex).buffer,
+ },
+ });
+ this.bufferWriteIndex = 0;
+ }
+ ```
+
+ **At the input sample rate of 16000 Hz, a chunk of 2048 samples is created and sent approximately every 128 milliseconds (2048 / 16000 = 0.128 seconds).**
+
+- **EventEmitter3:** The `AudioRecorder` class extends `EventEmitter3`, allowing it to emit events. Specifically, it emits a `data` event whenever a chunk of audio data is ready to be sent. Other parts of the application can listen for this event to receive the audio data.
+- **`start()` and `stop()` Methods:** These methods control the recording process, starting and stopping the microphone capture and managing the associated resources.
+
+**2. WebSocket Communication (`index.html`)**
+
+- **`ws.onopen`:** Sends the initial `setup` message to the Gemini API, specifying the model, audio output as the response modality, and the desired voice.
+- **`ws.onmessage`:** Handles incoming messages from the API:
+ - **`setupComplete`:** Enables the microphone button, indicating that the connection is ready.
+ - **`serverContent`:** Processes audio data, handles interruptions, and sends continuation signals as needed.
+- **`sendAudioChunk()`:** This function is triggered by the `data` event emitted by the `AudioRecorder`. It takes a chunk of audio data (which has already been converted to Int16 and then to base64 in the `AudioRecorder`), constructs a `realtime_input` message, and sends it to the API via `ws.send()`. The message format adheres to the `BidiGenerateContentRealtimeInput` structure defined in the API documentation (a sketch of this message appears after this list).
+- **`sendEndMessage()` and `sendContinueSignal()`:** These are crucial for managing the conversation flow.
+ - **`sendEndMessage()`:** Sends a message with `turn_complete: true` when the user stops recording (by clicking the "Stop Mic" button). This signals to the API that the user's turn is finished.
+ ```javascript
+ const message = {
+ client_content: {
+ turns: [
+ {
+ role: "user",
+ parts: [], // no more audio for this turn
+ },
+ ],
+ turn_complete: true, // end of turn
+ },
+ };
+ ```
+ - **`sendContinueSignal()`:** Sends a message with `turn_complete: false` immediately after receiving an audio chunk from the model, _unless_ the model indicates `turnComplete: true`. This serves as a keep-alive, letting the API know that the client is still listening and ready for more audio data. This is important for the low-latency, real-time nature of the interaction.
+ ```javascript
+ const message = {
+ client_content: {
+ turns: [
+ {
+ role: "user",
+ parts: [], // no more audio for this turn
+ },
+ ],
+ turn_complete: false, // not the end of turn, keep going
+ },
+ };
+ ```
+- **`toggleMicrophone()`:** Starts and stops the recording process, calling the appropriate methods in `AudioRecorder`.
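+
+As a reference, here is a sketch of the `realtime_input` message built by `sendAudioChunk()`. The field names follow the `BidiGenerateContentRealtimeInput` structure described above, but should be verified against the current API reference:
+
+```javascript
+function sendAudioChunk(base64Audio) {
+  // Stream one chunk of base64-encoded 16-bit PCM audio to the API.
+  const message = {
+    realtime_input: {
+      media_chunks: [
+        {
+          mime_type: "audio/pcm",
+          data: base64Audio,
+        },
+      ],
+    },
+  };
+  ws.send(JSON.stringify(message));
+}
+```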
+
+**3. Audio Playback and `AudioStreamer`:**
+
+- **`AudioStreamer` Class:** This class manages the playback of audio chunks received from the Gemini API.
+- **`AudioContext`:** It utilizes the Web Audio API's `AudioContext` for handling audio playback. The `AudioContext` is initialized only when the first audio chunk is received to comply with browser autoplay policies. It sets a sample rate of 24000 Hz.
+ - **Lazy Initialization:** The `AudioContext` is only created when the first audio chunk is received. This is because some browsers restrict audio playback unless it's initiated by a user action.
+ - **Sample Rate:** The sample rate is set to 24000 Hz, which is a common sample rate for speech audio.
+- **`addPCM16()`:** This method receives PCM16 audio chunks, converts them back to Float32, creates `AudioBuffer` objects, and adds them to an internal queue (`audioQueue`).
+- **`playNextBuffer()`:** This method retrieves audio buffers from the queue and plays them using an `AudioBufferSourceNode`. It ensures that chunks are played sequentially, one after the other, using the `onended` event of the source node and a small delay.
+- **`isPlaying` Flag:** This flag tracks whether audio is currently being played, preventing overlapping playback.
+- **`stop()` and `resume()`:** These methods provide control over stopping and resuming audio playback.
+- **`complete()`:** This method is called to signal the end of an audio stream, allowing any remaining buffers in the queue to be played out.
+- **Stall Detection:** Implements a mechanism to detect and recover from playback stalls, ensuring continuous audio flow. The `checkPlaybackStatus()` function periodically checks if audio playback has stalled (by comparing the current time with the last playback time). If a stall is detected and there are still buffers in the queue, it attempts to restart playback by calling `playNextBuffer()`. This is a safety net to handle situations where the `onended` event might not fire reliably or if there are unexpected delays in audio processing.
+
+ ```javascript
+ checkPlaybackStatus() {
+ // Clear any existing timeout
+ if (this.playbackTimeout) {
+ clearTimeout(this.playbackTimeout);
+ }
+
+ // Set a new timeout to check playback status
+ this.playbackTimeout = setTimeout(() => {
+ const now = this.context.currentTime;
+ const timeSinceLastPlayback = now - this.lastPlaybackTime;
+
+ // If more than 1 second has passed since last playback and we have buffers to play
+ if (timeSinceLastPlayback > 1 && this.audioQueue.length > 0 && this.isPlaying) {
+ console.log('Playback appears to have stalled, restarting...');
+ this.playNextBuffer();
+ }
+
+ // Continue checking if we're still playing
+ if (this.isPlaying) {
+ this.checkPlaybackStatus();
+ }
+ }, 1000);
+ }
+ ```
+
+**4. Interruption Handling:**
+
+- **Detection:** The API signals an interruption by sending a `serverContent` message with the `interrupted` flag set to `true`. This typically happens when the API's VAD detects speech from the user while the model is still speaking.
+ ```javascript
+ if (wsResponse.serverContent.interrupted) {
+ logMessage("Gemini: Interrupted");
+ isInterrupted = true;
+ audioStreamer.stop();
+ return;
+ }
+ ```
+- **Client-Side Handling:** When the `interrupted` flag is received:
+ 1. The `isInterrupted` flag is set to `true`.
+ 2. The `AudioStreamer`'s `stop()` method is called to immediately halt any ongoing audio playback. This ensures that the interrupted audio is not played.
+- **Latency:** The latency for interruption detection is primarily determined by the API's VAD and the network latency. The client-side processing adds minimal delay. On a fast connection, the interruption should feel near-instantaneous.
+- **No Specific Parameter:** There is no specific parameter in this code to tune the interruption sensitivity, as that is primarily controlled by the API's VAD.
+- **Effects of Changing VAD (if possible):** If the API provided a way to adjust VAD sensitivity (which it currently doesn't for the Multimodal Live API), the effects would be:
+ - **More Sensitive VAD:** Interruptions would be triggered more easily, potentially leading to a more responsive but also more "jumpy" conversation.
+ - **Less Sensitive VAD:** The model would be more likely to finish its turn, but it might feel less responsive to user interruptions.
+
+**5. Preventing Feedback Loop (No Echo):**
+
+In Chapter 2, with the Python SDK, we introduced a `model_speaking` flag to prevent the model from listening to itself. In this chapter, we achieve this without an explicit flag on the client side, **relying on the API's built-in turn management capabilities.** Here's how it works:
+
+- **Turn Detection:** The Gemini API uses its Voice Activity Detection (VAD) to determine when a user's turn begins and ends. When the user starts speaking, the VAD detects this as the start of a turn. When the user stops speaking for a certain duration (a pause), the VAD determines that the user's turn has ended.
+
+- **`turn_complete` Signal:** The `turn_complete: true` signal sent in the `sendEndMessage()` function after the user stops speaking explicitly tells the API that the user's turn is over. This is important for the API to properly segment the conversation. The signal is sent when the user clicks the "Stop Mic" button, which is only available while recording is active, so the user decides when a turn ends.
+
+- **API-Side Management:** The API manages the conversation flow internally, ensuring that the model only processes audio input that is considered part of the user's turn. The model does not start generating its response until the user's turn is deemed complete (either by `turn_complete: true` or by the VAD detecting a sufficiently long pause).
+
+- **`sendContinueSignal()`:** The `sendContinueSignal()` function sends `turn_complete: false` after model audio is received, unless the model has indicated `turn_complete: true`. This is important: without it, the model would not continue speaking if generating the audio takes longer than the VAD's pause detection.
+
+Essentially, the API is designed to handle the "listen while speaking" scenario gracefully. It's not simply feeding the output audio back into the input. The VAD and turn management logic ensure that the model only processes audio it considers as user input.
+
+**6. Audio Streaming and Context Window:**
+
+- **Continuous Streaming:** As long as the microphone is active and the user is speaking, audio data is continuously sent to the Gemini API in chunks. This is necessary for real-time interaction.
+- **Chunk Size and Data Rate:**
+ - Each chunk contains 2048 samples of 16-bit PCM audio.
+ - Each sample is 2 bytes (16 bits = 2 bytes).
+ - Therefore, each chunk is 2048 samples \* 2 bytes/sample = 4096 bytes.
+ - Chunks are sent roughly every 128 milliseconds.
+ - This translates to a data rate of approximately 4096 bytes / 0.128 seconds = 32 KB/s (kilobytes per second).
+  - **VAD and Turn Boundaries:** The API's VAD plays a crucial role in determining the boundaries of a turn. When VAD detects a significant enough pause in the user's speech, it considers the turn to be over, and the model generates a response based on that segment of audio.
+ - **Practical Implications:** For a natural conversational flow, it's generally a good practice to keep your utterances relatively concise and allow for turn-taking. This helps the API process the audio effectively and generate relevant responses.
+
+**7. User Interface (`index.html`)**
+
+- **"Start Mic"/"Stop Mic" Button:** This button controls the microphone recording. Its text toggles between "Start Mic" and "Stop Mic" depending on the recording state.
+- **Output Area:** The `div` with the ID `output` is used to display messages to the user, such as "Recording started...", "Recording stopped...", "Gemini: Speaking...", and "Gemini: Finished speaking".
+- **Visual Feedback:** The UI provides basic visual feedback about the state of the application (recording, playing audio, etc.).
+- **Initial State:** When the page loads, the microphone button is disabled. It is only enabled after the WebSocket connection is successfully established and the setup message exchange is complete.
+
+**8. Debugging**
+
+- **Browser Developer Tools:** The primary tool for debugging this application is your browser's developer tools (usually accessed by pressing F12).
+ - **Console:** Use the console to view `console.log` messages, errors, and warnings. The code includes numerous `console.log` statements to help you track the flow of execution and the data being processed.
+ - **Network Tab:** Use the Network tab to monitor WebSocket traffic. You can inspect the individual messages being sent and received, including their contents and timing. This is invaluable for understanding the communication with the API.
+ - **Debugger:** Use the JavaScript debugger to set breakpoints, step through the code, inspect variables, and analyze the call stack.
+- **`logMessage()` Function:** This function provides a simple way to display messages in the `output` div on the page, providing visual feedback within the application itself.
+
+**9. Further Considerations**
+
+- **Error Handling:** The code includes basic error handling, but it could be made more robust by handling specific error codes or messages from the API and providing more informative feedback to the user.
+- **Security:** The API key is currently hardcoded in the HTML file. For production, you should **never** expose your API key directly in client-side code. Instead, use a secure backend server to handle authentication and proxy requests to the API.
+- **Scalability:** This example is designed for a single user. For a multi-user scenario, you would need to manage multiple WebSocket connections and potentially use a server-side component to handle user sessions and routing.
+- **Audio Quality:** The audio quality depends on the microphone, network conditions, and the API's processing. You can experiment with different sample rates and chunk sizes, but these values are often constrained by the API's requirements and the need to balance latency and bandwidth.
+- **Network Latency:** Network latency can significantly impact the real-time performance of the application. There's no single solution to mitigate network latency, but using a server closer to the user's location and optimizing the audio processing pipeline can help.
+- **Audio Level:** The `AudioStreamer` includes a `gainNode` for controlling the volume of the output audio. It is not used yet, but it could be exposed to the user through the UI if needed.
+
+## Web Audio API
+
+The Web Audio API is a high-level JavaScript API for processing and synthesizing audio in web applications. It provides a powerful and flexible system for manipulating audio within the browser. It is based on the idea of an **audio graph**, where different **audio nodes** are connected to process an audio stream.
+
+**Key Concepts:**
+
+- **`AudioContext`:** The primary interface for working with the Web Audio API. It represents an audio-processing graph built from audio nodes. You can only have one `AudioContext` per document. Think of it as the container or the manager for all audio operations.
+- **Audio Nodes:** Building blocks of the audio graph. They perform specific audio processing tasks. Examples include:
+ - **`AudioBufferSourceNode`:** Represents an audio source consisting of in-memory audio data stored in an `AudioBuffer`. Used here to play the audio chunks received from the API.
+ - **`MediaStreamAudioSourceNode`:** Represents an audio source consisting of a `MediaStream` (e.g., from a microphone). Used here to capture audio from the microphone.
+ - **`GainNode`:** Controls the volume (gain) of the audio signal. Used here for potential volume adjustments.
+ - **`AudioWorkletNode`:** A special type of node that allows you to run custom audio processing JavaScript code in a separate thread (the audio rendering thread). This is essential for real-time audio processing as it prevents blocking the main thread and causing glitches. Used here (`audio-recording-worklet.js`) to handle audio chunking and format conversion in a separate thread.
+- **`AudioBuffer`:** Represents a short audio asset residing in memory. Used to hold the audio data of each chunk.
+- **`AudioParam`:** Represents a parameter of an audio node (e.g., the gain of a `GainNode`). Can be automated over time.
+- **`AudioWorklet`:** Enables developers to write custom audio processing scripts that run in a separate thread. This is crucial for performance-sensitive audio applications, as it ensures that audio processing doesn't block the main thread and cause glitches or delays. `AudioWorklet`s are defined in separate JavaScript files (like `audio-recording-worklet.js`) and are added to the `AudioContext` using `audioContext.audioWorklet.addModule()`.
+
+**How This Application Uses the Web Audio API:**
+
+- **`AudioContext`:** An `AudioContext` is created to manage the entire audio graph. It's initialized with a sample rate of 24000 Hz, matching the API's output sample rate.
+- **`AudioWorkletNode`:** An `AudioWorkletNode` is used to run the `AudioProcessingWorklet` defined in `audio-recording-worklet.js`. This handles the real-time processing of microphone input, converting it to Int16 format and dividing it into chunks.
+- **`AudioBufferSourceNode`:** An `AudioBufferSourceNode` is created for each audio chunk received from the API. The audio data is decoded, converted to Float32, and then used to create an `AudioBuffer` that is assigned to the source node.
+- **`MediaStreamAudioSourceNode`:** A `MediaStreamAudioSourceNode` is created to capture the audio stream from the user's microphone.
+- **`GainNode`:** A `GainNode` is connected to the output for potential volume control.
+- **Connections:** The nodes are connected: `MediaStreamAudioSourceNode` -> `AudioWorkletNode` (for input processing), and `AudioBufferSourceNode` -> `GainNode` -> `AudioContext.destination` (for output).
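+
+A condensed sketch of how the input side of this graph is wired up. It assumes the input path uses its own `AudioContext` at 16000 Hz (as configured in `audio-recorder.js`) and that the worklet registers itself under the name `"audio-recorder-worklet"`; both names are assumptions for illustration:
+
+```javascript
+async function setUpMicrophoneGraph() {
+  // Capture the microphone and create an input context at 16 kHz.
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  const inputContext = new AudioContext({ sampleRate: 16000 });
+
+  // Load the worklet module and create a node for it.
+  await inputContext.audioWorklet.addModule("audio-recording-worklet.js");
+  const workletNode = new AudioWorkletNode(inputContext, "audio-recorder-worklet"); // registered name is an assumption
+
+  // Microphone -> worklet (Float32 to Int16 conversion and chunking happen in the worklet).
+  const micSource = inputContext.createMediaStreamSource(stream);
+  micSource.connect(workletNode);
+
+  // Each 2048-sample chunk posted by the worklet arrives here as an ArrayBuffer of Int16 samples.
+  workletNode.port.onmessage = (event) => {
+    const { int16arrayBuffer } = event.data.data;
+    // base64-encode the buffer and hand it to sendAudioChunk() (encoding helper omitted).
+  };
+}
+```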
+
+**Audio Queueing and Buffering:**
+
+- **`audioQueue`:** This array in `AudioStreamer` acts as a queue for incoming audio chunks. Chunks are added to the queue as they are received from the API.
+- **`playNextBuffer()`:** This function retrieves and plays buffers from the queue sequentially. It uses the `onended` event of the `AudioBufferSourceNode` to trigger the playback of the next chunk, ensuring a continuous stream.
+- **Buffering:** The Web Audio API internally handles some buffering, but the `audioQueue` provides an additional layer of buffering to smooth out any irregularities in the arrival of audio chunks.
+
+**Batched Sending:**
+
+- The term "batching" isn't explicitly used in the code, but the concept is present in how audio chunks are created and sent. The `AudioWorklet` buffers 2048 samples before sending a chunk. This can be considered a form of batching, as it sends data in discrete units rather than a continuous stream of individual samples. This approach balances the need for real-time responsiveness with the efficiency of sending data in larger packets.
+
+## Configuration and Parameters
+
+The following parameters and values are used in this application and can be customized:
+
+- **`model`:** `"models/gemini-2.0-flash-exp"` (specifies the Gemini model).
+- **`response_modalities`:** `["audio"]` (requests audio output from the API).
+- **`speech_config`:**
+ - **`voice_config`**:
+ - **`prebuilt_voice_config`**:
+ - **`voice_name`**: `Aoede` (specifies which voice to use).
+ Possible values: `Aoede`, `Charon`, `Fenrir`, `Kore`, `Puck`
+- **`sampleRate`:**
+  The sample rate is set to 16000 Hz for the input and 24000 Hz for the output. This is dictated by the API's requirements.
+ - **Input (Microphone):** 16000 Hz (set in `audio-recorder.js`). This is a common sample rate for speech recognition.
+ - **Why 16000 Hz for input?** 16000 Hz is a standard sample rate for speech processing and is often used in speech recognition systems because it captures most of the relevant frequency information in human speech while keeping computational costs manageable. Using a higher sample rate for input might not provide significant improvements in speech recognition accuracy for this application.
+ - **Output (API):** 24000 Hz (specified in the API documentation and when creating the `AudioContext`). This is a higher sample rate, providing better audio quality for playback.
+ - **Why 24000 Hz for output?** 24000 Hz is chosen because it's the sample rate at which the API provides audio output. Using this rate ensures that the audio is played back at the correct speed and pitch.
+- **`CHUNK_SIZE` (in `audio-recording-worklet.js`):** 2048 samples. This determines the size of the audio chunks sent to the API. It represents a good balance between latency and processing overhead.
+ - **Calculation:** With a sample rate of 16000 Hz, a 2048-sample chunk corresponds to 2048 / 16000 = 0.128 seconds, or 128 milliseconds.
+ - **Why 2048 samples per chunk?** This value is chosen to balance the need for low latency with the overhead of sending frequent messages. Smaller chunks would result in lower latency but would increase the number of messages sent to the API, potentially leading to higher processing overhead and network congestion. Larger chunks would reduce the frequency of messages but increase latency.
+ - **Effects of Changing `CHUNK_SIZE`:**
+ - **Smaller `CHUNK_SIZE` (e.g., 1024 samples):**
+ - **Pros:** Lower latency (around 64 milliseconds per chunk). The application would feel more responsive.
+ - **Cons:** Increased processing overhead on both the client and server sides due to more frequent message sending and handling. Increased network traffic. The audio might also start to sound choppy and distorted due to potential buffer underruns.
+ - **Larger `CHUNK_SIZE` (e.g., 4096 samples):**
+ - **Pros:** Reduced processing overhead and network traffic.
+ - **Cons:** Higher latency (around 256 milliseconds per chunk). The application would feel less responsive, and the conversation might feel sluggish.
+- **Audio Format** (a conversion sketch follows this list):
+ - **Input:** The microphone provides audio data in Float32 format.
+ - **API Input:** The API expects audio data in 16-bit linear PCM (Int16) format, little-endian.
+ - **API Output:** The API provides audio data in base64-encoded 16-bit linear PCM (Int16) format, little-endian.
+ - **Output:** The `AudioContext` works with Float32 audio data.
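+
+A sketch of the sending-side conversion, from Float32 microphone samples to base64-encoded 16-bit little-endian PCM; the function name is illustrative and the surrounding message payload is left out:
+
+```javascript
+// Convert Float32 samples to 16-bit little-endian PCM, then base64-encode
+// the bytes for inclusion in a JSON message to the API.
+function float32ToPcm16Base64(float32Samples) {
+  const view = new DataView(new ArrayBuffer(float32Samples.length * 2));
+  for (let i = 0; i < float32Samples.length; i++) {
+    // Clamp to [-1, 1] and scale to the Int16 range.
+    const s = Math.max(-1, Math.min(1, float32Samples[i]));
+    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true); // true = little-endian
+  }
+  // btoa expects a binary string, so build one byte by byte.
+  let binary = '';
+  const bytes = new Uint8Array(view.buffer);
+  for (let i = 0; i < bytes.length; i++) {
+    binary += String.fromCharCode(bytes[i]);
+  }
+  return btoa(binary);
+}
+```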
+
+## Lessons Learned and Best Practices
+
+Through the development of this real-time audio streaming application, several important lessons and best practices emerged:
+
+### Audio Context Setup
+
+- **Lazy Initialization:** Initialize the `AudioContext` only when needed, typically in response to a user interaction, to comply with browser autoplay policies.
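+
+A minimal sketch of lazy initialization, assuming a hypothetical `#start-button` element as the triggering user gesture:
+
+```javascript
+// Create the AudioContext lazily, inside a user-gesture handler, so the
+// browser's autoplay policy does not block audio. 24000 Hz matches the
+// API's output sample rate.
+let audioContext = null;
+
+async function ensureAudioContext() {
+  if (!audioContext) {
+    audioContext = new AudioContext({ sampleRate: 24000 });
+  }
+  if (audioContext.state === 'suspended') {
+    await audioContext.resume();
+  }
+  return audioContext;
+}
+
+// Example: initialize on the first click of a (hypothetical) start button.
+document.querySelector('#start-button')?.addEventListener('click', () => {
+  ensureAudioContext();
+});
+```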
+
+### Audio Buffer Management
+
+- **Avoid Fixed Buffer Sizes:** Instead of using fixed buffer sizes and trying to manage partial buffers, adapt to the natural chunk sizes provided by the API. Process each chunk as it arrives. This simplifies buffer management and improves playback smoothness.
+- **Don't Overcomplicate:** Simple sequential playback using the `onended` event is often more effective and easier to manage than complex scheduling logic.
+
+### PCM16 Data Handling
+
+- **Correct Conversion:** Ensure that PCM16 data is correctly interpreted and converted to Float32 format for the Web Audio API. The conversion involves normalizing the 16-bit integer values to the range [-1, 1].
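+
+A sketch of the receiving-side conversion, from a base64 PCM16 chunk to a normalized `Float32Array`; the function name is illustrative:
+
+```javascript
+// Decode a base64 chunk of 16-bit little-endian PCM from the API into a
+// Float32Array in the [-1, 1] range expected by the Web Audio API.
+function base64Pcm16ToFloat32(base64Audio) {
+  const binary = atob(base64Audio);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) {
+    bytes[i] = binary.charCodeAt(i);
+  }
+  // Interpret the bytes as little-endian 16-bit signed integers.
+  const view = new DataView(bytes.buffer);
+  const float32 = new Float32Array(bytes.length / 2);
+  for (let i = 0; i < float32.length; i++) {
+    float32[i] = view.getInt16(i * 2, true) / 32768; // normalize to [-1, 1]
+  }
+  return float32;
+}
+```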
+
+### Playback Timing and Scheduling
+
+- **Sequential Playback:** Use the `onended` event of `AudioBufferSourceNode` to trigger the playback of the next audio chunk. This ensures that chunks are played sequentially without overlap.
+- **Avoid Aggressive Scheduling:** Do not schedule buffers too far in advance. This can lead to memory issues and make it difficult to handle interruptions.
+
+### Interruption Handling
+
+- **Immediate Stop:** When an interruption is detected (using the `interrupted` flag from the API), stop the current audio playback immediately using `audioStreamer.stop()`.
+- **State Reset:** Reset the `isInterrupted` flag and any other relevant state variables to prepare for new audio input.
+- **Clear Buffers:** Ensure that any pending audio buffers are cleared to prevent stale audio from playing.
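+
+A minimal sketch of this flow, assuming the `interrupted` flag arrives on a `serverContent` field of the parsed message, that `audioStreamer.stop()` halts playback and clears its queue, and that `isInterrupted` is a module-level flag (the message shape is an assumption):
+
+```javascript
+// Illustrative interruption handling inside the WebSocket message handler.
+function handleServerMessage(message) {
+  if (message.serverContent && message.serverContent.interrupted) {
+    audioStreamer.stop();  // stop current playback and drop queued buffers
+    isInterrupted = false; // reset state so the next turn starts cleanly
+    return;
+  }
+  // ...otherwise decode the audio chunk and hand it to the streamer...
+}
+```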
+
+### Protocol Management
+
+- **Setup Message:** Send the `setup` message as the very first message after establishing the WebSocket connection. This configures the session with the API.
+- **Voice Selection:** In the setup message, specify a voice in the speech config; this determines the voice used for the audio responses.
+- **Continue Signals:** Send `client_content` messages with `turn_complete: false` to maintain the streaming connection and signal that the client is ready for more audio data. Send these signals immediately after receiving and processing an audio chunk from the model.
+- **Turn Completion:** Send a `client_content` message with `turn_complete: true` to indicate the end of the user's turn.
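+
+The payloads below illustrate these messages, using the snake_case field names described in this chapter; the `generation_config` nesting and the `ws` WebSocket variable are assumptions, so check them against the chapter's code and the API reference:
+
+```javascript
+// 1. Setup message: the first message after the WebSocket opens.
+const setupMessage = {
+  setup: {
+    model: 'models/gemini-2.0-flash-exp',
+    generation_config: {
+      response_modalities: ['audio'],
+      speech_config: {
+        voice_config: { prebuilt_voice_config: { voice_name: 'Aoede' } },
+      },
+    },
+  },
+};
+ws.send(JSON.stringify(setupMessage));
+
+// 2. Continue signal: keep the stream going after processing a chunk.
+ws.send(JSON.stringify({ client_content: { turn_complete: false } }));
+
+// 3. Turn completion: signal the end of the user's turn.
+ws.send(JSON.stringify({ client_content: { turn_complete: true } }));
+```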
+
+### State Management
+
+- **Track Essential States:** Keep track of states like `isRecording`, `initialized`, and `isInterrupted` to manage the application flow correctly.
+- **Reset States Appropriately:** Reset these states at the appropriate times, such as when starting a new recording or after an interruption.
+
+### Technical Requirements and Best Practices
+
+- **`AudioContext` Sample Rate:** Always initialize the `AudioContext` with a sample rate of 24000 Hz for compatibility with the Gemini API.
+- **WebSocket Configuration:** Ensure the WebSocket connection is properly configured with the correct API endpoint and API key.
+- **Event Handling:** Implement proper event handling for all relevant audio and WebSocket events, including `onopen`, `onmessage`, `onerror`, `onclose`, `onended`, and custom events like the `data` event from `AudioRecorder`.
+- **State Management:** Implement robust state management to track the recording state, initialization state, interruption state, and other relevant flags.
+
+### Common Pitfalls to Avoid
+
+- **Overly Complex Buffer Management:** Avoid using fixed buffer sizes or complex buffering logic when a simpler sequential approach is sufficient.
+- **Aggressive Buffer Scheduling:** Don't schedule audio buffers too far in advance, as this can lead to memory issues and complicate interruption handling.
+- **Incorrect PCM16 Handling:** Ensure that PCM16 data is correctly converted to Float32 format and that the correct sample rates are used for recording and playback.
+- **Ignoring `turn_complete`:** Always handle the `turn_complete` signal from the API to properly manage turn-taking.
+- **Neglecting State Management:** Failing to properly manage and reset state variables can lead to unexpected behavior and bugs.
+- **Forgetting Continue Signals:** Remember to send continue signals to maintain the streaming connection, especially during long audio generation.
+
+## Summary
+
+This chapter provides a real-world example of building a real-time, audio-to-audio chat application with the Gemini Multimodal Live API using WebSockets and the Web Audio API. It demonstrates the complexities of handling live audio streams, managing bidirectional communication, and performing necessary audio format conversions, all within a browser environment. It also highlights common pitfalls and best practices discovered during the development process.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_05/index.html b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_05/index.html
new file mode 100644
index 00000000..7f7d3176
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_05/index.html
@@ -0,0 +1,218 @@
+Gemini Audio-to-Audio WebSocket Demo (Dev API)
+
+Gemini Live Audio Chat (Dev API)
+
+This application demonstrates real-time audio-to-audio chat using the
+Gemini API and WebSockets. Speak into your microphone and receive audio
+responses in real time. The app uses the Web Audio API for capturing
+microphone input and playing back responses, with support for natural
+conversation flow and interruptions.
diff --git a/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_06/README.md b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_06/README.md
new file mode 100644
index 00000000..1bddb52b
--- /dev/null
+++ b/genai-on-vertex-ai/gemini_2_0/gemini-multimodal-live-api-dev-guide/part_2_dev_api/chapter_06/README.md
@@ -0,0 +1,123 @@
+# Chapter 6: Gemini Live Chat - Real-time Multimodal Interaction with WebSockets
+
+This chapter takes the real-time audio chat application from **Chapter 5** and significantly enhances it by incorporating **live video input** from the user's webcam or screen. This creates a truly **multimodal** interaction with the Gemini API, demonstrating a more sophisticated and engaging use case. We'll be building upon the existing WebSocket communication and Web Audio API infrastructure to handle both audio and video streams simultaneously.
+
+**Building Upon Previous Chapters:**
+
+Chapter 6 leverages the foundational concepts and components established in earlier chapters:
+
+- **Chapter 2 (Live Audio Chat with Gemini):** Provided the basis for real-time audio interaction, which we extend here.
+- **Chapter 3 (Low-Level WebSocket Interaction):** Introduced the core WebSocket communication principles that are essential for this chapter.
+- **Chapter 4 (Text-to-Speech with WebSockets):** Demonstrated basic audio handling with WebSockets, which we build upon for live audio streaming.
+- **Chapter 5 (Real-time Audio-to-Audio):** Established the foundation for real-time audio streaming using WebSockets and the Web Audio API. Chapter 6 extends this by adding video capabilities. We'll reuse the `AudioRecorder`, `AudioStreamer`, and WebSocket communication logic from Chapter 5.
+
+**New Functionalities in Chapter 6:**
+
+This chapter introduces the following key additions:
+
+1. **Video Capture and Management:**
+
+ - **`MediaHandler` Class:** A new `MediaHandler` class is introduced to manage user media, specifically for webcam and screen capture. It's responsible for:
+ - Requesting access to the user's webcam or screen using `navigator.mediaDevices.getUserMedia()` and `navigator.mediaDevices.getDisplayMedia()`.
+ - Starting and stopping video streams.
+ - Capturing individual frames from the video stream.
+ - Managing the active state of the webcam and screen sharing (using `isWebcamActive` and `isScreenActive` flags).
+ - **Webcam and Screen Sharing Toggle:** The UI now includes two new buttons with material symbol icons:
+ - **Webcam Button:** Toggles the webcam on and off.
+ - **Screen Sharing Button:** Toggles screen sharing on and off.
+ - **Video Preview:** A `