Skip to content

Commit ecb8c27

Browse files
authored
feat: add Vertex AI Search and Vector Search data connectors for agentic_rag (#791)
* feat: replace Vertex AI Pipeline data ingestion with GCS Data Connector for vertex_ai_search

  - Switch agentic_rag agent from LangChain retriever to native ADK VertexAiSearchTool for vertex_ai_search datastore type
  - Replace Vertex AI Pipeline-based ingestion with GCS Data Connector approach using Discovery Engine setUpDataConnectorV2 API
  - Add Terraform null_resource + external data source pattern for managing data connectors and retrieving auto-created data store IDs
  - Add shell scripts for connector setup, data store ID retrieval, and on-demand sync
  - Add Makefile sync-data target for triggering manual data syncs
  - Skip data_ingestion pipeline file copy for vertex_ai_search projects
  - Update CI/CD pipelines to only include pipeline steps for vertex_ai_vector_search
  - Update deployment target service.tf files to use external data source references

* feat: add Vertex AI Search and Vector Search data connectors for agentic_rag

  - Migrate data ingestion from shared location to agent-specific `agents/agentic_rag/data_ingestion/`
  - Replace shell scripts with Python scripts for data connector setup and management
  - Add Vector Search 2.0 Collections API support alongside Vertex AI Search
  - Add `setup-datastore` CLI command and sample data for both datastore types
  - Use dedicated asp-rag GCP projects for agentic_rag e2e tests
  - Conditionally enable vectorsearch API only when datastore type requires it
  - Add --wait flag to connector run for blocking sync support
1 parent 6198e82 commit ecb8c27

File tree

86 files changed

+7096
-3609
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

86 files changed

+7096
-3609
lines changed

.cloudbuild/terraform/apis.tf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,19 @@ resource "google_project_service" "cloud_resource_manager_api" {
2525
disable_on_destroy = false
2626
}
2727

28+
# Enable Cloud Resource Manager API for each project in e2e_rag_project_mapping
29+
resource "google_project_service" "cloud_resource_manager_api_rag" {
30+
for_each = {
31+
"dev" = var.e2e_rag_project_mapping.dev
32+
"staging" = var.e2e_rag_project_mapping.staging
33+
"prod" = var.e2e_rag_project_mapping.prod
34+
}
35+
36+
project = each.value
37+
service = "cloudresourcemanager.googleapis.com"
38+
disable_on_destroy = false
39+
}
40+
2841
# Enable Cloud Scheduler API for scheduled cleanup jobs
2942
resource "google_project_service" "cloud_scheduler_api" {
3043
project = var.cicd_runner_project_id

.cloudbuild/terraform/build_triggers.tf

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ locals {
3333
"agent_starter_pack/agents/**",
3434
"agent_starter_pack/cli/**",
3535
"tests/**",
36-
"agent_starter_pack/data_ingestion/**",
36+
"agent_starter_pack/agents/agentic_rag/data_ingestion/**",
3737
"pyproject.toml",
3838
"uv.lock",
3939
".cloudbuild/**",
@@ -42,7 +42,7 @@ locals {
4242
lint_templated_agents_included_files = [
4343
"agent_starter_pack/cli/**",
4444
"agent_starter_pack/base_templates/**",
45-
"agent_starter_pack/data_ingestion/**",
45+
"agent_starter_pack/agents/agentic_rag/data_ingestion/**",
4646
"agent_starter_pack/deployment_targets/**",
4747
"tests/integration/test_template_linting.py",
4848
"tests/integration/test_templated_patterns.py",
@@ -82,11 +82,11 @@ locals {
8282
},
8383
{
8484
name = "agentic_rag-agent_engine-vertex_ai_search"
85-
value = "agentic_rag,agent_engine,--include-data-ingestion,--datastore,vertex_ai_search"
85+
value = "agentic_rag,agent_engine,--datastore,vertex_ai_search"
8686
},
8787
{
8888
name = "agentic_rag-cloud_run-vertex_ai_vector_search"
89-
value = "agentic_rag,cloud_run,--include-data-ingestion,--datastore,vertex_ai_vector_search"
89+
value = "agentic_rag,cloud_run,--datastore,vertex_ai_vector_search"
9090
},
9191
{
9292
name = "adk_live-agent_engine"
@@ -237,7 +237,7 @@ locals {
237237
},
238238
{
239239
name = "agentic_rag-agent_engine-vertex_ai_search-github"
240-
value = "agentic_rag,agent_engine,--include-data-ingestion,--datastore,vertex_ai_search,--cicd-runner,github_actions"
240+
value = "agentic_rag,agent_engine,--datastore,vertex_ai_search,--cicd-runner,github_actions"
241241
},
242242
{
243243
name = "adk_live-agent_engine-github"
@@ -257,11 +257,11 @@ locals {
257257
},
258258
{
259259
name = "agentic_rag-agent_engine-vertex_ai_search"
260-
value = "agentic_rag,agent_engine,--include-data-ingestion,--datastore,vertex_ai_search"
260+
value = "agentic_rag,agent_engine,--datastore,vertex_ai_search"
261261
},
262262
{
263263
name = "agentic_rag-cloud_run-vertex_ai_vector_search"
264-
value = "agentic_rag,cloud_run,--include-data-ingestion,--datastore,vertex_ai_vector_search"
264+
value = "agentic_rag,cloud_run,--datastore,vertex_ai_vector_search"
265265
},
266266
{
267267
name = "adk_live-agent_engine"
@@ -388,7 +388,7 @@ locals {
388388
"pyproject.toml",
389389
] : substr(combo.name, 0, 11) == "agentic_rag" ? [
390390
"agent_starter_pack/agents/agentic_rag/**",
391-
"agent_starter_pack/data_ingestion/**",
391+
"agent_starter_pack/agents/agentic_rag/data_ingestion/**",
392392
"pyproject.toml",
393393
] : substr(combo.name, 0, 8) == "adk_live" ? [
394394
"agent_starter_pack/agents/adk_live/**",
@@ -401,7 +401,7 @@ locals {
401401
# Shared and Python base templates only (not Go/Java/TypeScript)
402402
"agent_starter_pack/base_templates/_shared/**",
403403
"agent_starter_pack/base_templates/python/**",
404-
"agent_starter_pack/data_ingestion/**",
404+
"agent_starter_pack/agents/agentic_rag/data_ingestion/**",
405405
# Python deployment targets only (not Go/Java/TypeScript)
406406
"agent_starter_pack/deployment_targets/agent_engine/_shared/**",
407407
"agent_starter_pack/deployment_targets/agent_engine/python/**",
@@ -562,9 +562,9 @@ resource "google_cloudbuild_trigger" "main_e2e_deployment_test" {
562562

563563
substitutions = {
564564
_TEST_AGENT_COMBINATION = each.value.value
565-
_E2E_DEV_PROJECT = var.e2e_test_project_mapping.dev
566-
_E2E_STAGING_PROJECT = var.e2e_test_project_mapping.staging
567-
_E2E_PROD_PROJECT = var.e2e_test_project_mapping.prod
565+
_E2E_DEV_PROJECT = startswith(each.key, "agentic_rag") ? var.e2e_rag_project_mapping.dev : var.e2e_test_project_mapping.dev
566+
_E2E_STAGING_PROJECT = startswith(each.key, "agentic_rag") ? var.e2e_rag_project_mapping.staging : var.e2e_test_project_mapping.staging
567+
_E2E_PROD_PROJECT = startswith(each.key, "agentic_rag") ? var.e2e_rag_project_mapping.prod : var.e2e_test_project_mapping.prod
568568
_SECRETS_PROJECT_ID = "asp-e2e-vars"
569569
_COMMIT_MESSAGE = "$(push.head_commit.message)"
570570
}

.cloudbuild/terraform/service_account.tf

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,25 @@ resource "google_project_iam_member" "cicd_runner_e2e_project_roles" {
6060
member = "serviceAccount:${google_service_account.cicd_runner_sa.email}"
6161
}
6262

63+
# Grant permissions to the service account for RAG E2E project environments
64+
resource "google_project_iam_member" "cicd_runner_rag_project_roles" {
65+
for_each = {
66+
for idx, proj_role in flatten([
67+
for env, project_id in var.e2e_rag_project_mapping : [
68+
for role in local.e2e_project_roles : {
69+
project = project_id
70+
env = env
71+
role = role
72+
}
73+
]
74+
]) : "${proj_role.env}-${proj_role.role}" => proj_role
75+
}
76+
77+
project = each.value.project
78+
role = each.value.role
79+
member = "serviceAccount:${google_service_account.cicd_runner_sa.email}"
80+
}
81+
6382
# Grant owner permissions to the service account for all cleanup projects
6483
resource "google_project_iam_member" "cicd_runner_cleanup_project_roles" {
6584
for_each = toset(var.cleanup_project_ids)

.cloudbuild/terraform/variables.tf

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ variable "e2e_test_project_mapping" {
4242
})
4343
}
4444

45+
variable "e2e_rag_project_mapping" {
46+
description = "Mapping of project IDs for agentic_rag E2E tests (separate projects with datastore resources)"
47+
type = object({
48+
dev = string
49+
prod = string
50+
staging = string
51+
})
52+
}
53+
4554
variable "cleanup_project_ids" {
4655
description = "List of all project IDs that need cleanup (for scheduled cleanup job)"
4756
type = list(string)
@@ -51,6 +60,9 @@ variable "cleanup_project_ids" {
5160
"asp-e2e-prd",
5261
"asp-test-dev",
5362
"asp-test-prd",
54-
"asp-test-stg"
63+
"asp-test-stg",
64+
"asp-rag-dev",
65+
"asp-rag-stg",
66+
"asp-rag-prd"
5567
]
5668
}

.cloudbuild/terraform/vars/env.tfvars

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,9 @@ e2e_test_project_mapping = {
1111
staging = "asp-e2e-stg"
1212
prod = "asp-e2e-prd"
1313
}
14+
15+
e2e_rag_project_mapping = {
16+
dev = "asp-rag-dev"
17+
staging = "asp-rag-stg"
18+
prod = "asp-rag-prd"
19+
}

GEMINI.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,6 @@ asp_version = "0.25.0"
707707
deployment_target = "cloud_run"
708708
session_type = "in_memory"
709709
cicd_runner = "skip"
710-
include_data_ingestion = false
711710
```
712711

713712
The `create_params` section enables the `enhance` command to recreate identical scaffolding with the locked ASP version.

agent_starter_pack/agents/agentic_rag/.template/templateconfig.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ settings:
2020
deployment_targets: ["agent_engine", "cloud_run", "none"]
2121
extra_dependencies: [
2222
"google-adk>=1.15.0,<2.0.0",
23+
"google-cloud-vectorsearch",
2324
"langchain-google-vertexai~=2.0.7",
2425
"langchain~=0.3.24",
2526
"langchain-core~=0.3.55",

agent_starter_pack/agents/agentic_rag/app/agent.py

Lines changed: 30 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,15 @@
2929
)
3030
from google.cloud import bigquery
3131
{%- endif %}
32+
{%- if cookiecutter.datastore_type == "vertex_ai_search" %}
33+
from google.adk.tools import VertexAiSearchTool
34+
{%- endif %}
3235
from google.genai import types
33-
from langchain_google_vertexai import VertexAIEmbeddings
36+
{%- if cookiecutter.datastore_type == "vertex_ai_vector_search" %}
3437

35-
from {{cookiecutter.agent_directory}}.retrievers import get_compressor, get_retriever
36-
from {{cookiecutter.agent_directory}}.templates import format_docs
38+
from {{cookiecutter.agent_directory}}.retrievers import search_collection
39+
{%- endif %}
3740

38-
EMBEDDING_MODEL = "text-embedding-005"
3941
LLM_LOCATION = "global"
4042
LOCATION = "us-central1"
4143
LLM = "gemini-3-flash-preview"
@@ -46,47 +48,22 @@
4648
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"
4749

4850
vertexai.init(project=project_id, location=LOCATION)
49-
embedding = VertexAIEmbeddings(
50-
project=project_id, location=LOCATION, model_name=EMBEDDING_MODEL
51-
)
5251

5352
{% if cookiecutter.datastore_type == "vertex_ai_search" %}
54-
EMBEDDING_COLUMN = "embedding"
55-
TOP_K = 5
56-
57-
data_store_region = os.getenv("DATA_STORE_REGION", "us")
58-
data_store_id = os.getenv("DATA_STORE_ID", "{{cookiecutter.project_name}}-datastore")
59-
60-
retriever = get_retriever(
61-
project_id=project_id,
62-
data_store_id=data_store_id,
63-
data_store_region=data_store_region,
64-
embedding=embedding,
65-
embedding_column=EMBEDDING_COLUMN,
66-
max_documents=10,
53+
data_store_region = os.getenv("DATA_STORE_REGION", "global")
54+
data_store_id = os.getenv(
55+
"DATA_STORE_ID", "{{cookiecutter.project_name}}-collection_documents"
6756
)
68-
{% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
69-
vector_search_index = os.getenv(
70-
"VECTOR_SEARCH_INDEX", "{{cookiecutter.project_name}}-vector-search"
71-
)
72-
vector_search_index_endpoint = os.getenv(
73-
"VECTOR_SEARCH_INDEX_ENDPOINT", "{{cookiecutter.project_name}}-vector-search-endpoint"
74-
)
75-
vector_search_bucket = os.getenv(
76-
"VECTOR_SEARCH_BUCKET", f"{project_id}-{{cookiecutter.project_name}}-vs"
57+
data_store_path = (
58+
f"projects/{project_id}/locations/{data_store_region}"
59+
f"/collections/default_collection/dataStores/{data_store_id}"
7760
)
7861

79-
retriever = get_retriever(
80-
project_id=project_id,
81-
region=LOCATION,
82-
vector_search_bucket=vector_search_bucket,
83-
vector_search_index=vector_search_index,
84-
vector_search_index_endpoint=vector_search_index_endpoint,
85-
embedding=embedding,
86-
)
87-
{% endif %}
88-
compressor = get_compressor(
89-
project_id=project_id,
62+
vertex_search_tool = VertexAiSearchTool(data_store_id=data_store_path)
63+
{% elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
64+
vector_search_collection = os.getenv(
65+
"VECTOR_SEARCH_COLLECTION",
66+
f"projects/{project_id}/locations/{LOCATION}/collections/{{cookiecutter.project_name}}-collection",
9067
)
9168

9269

@@ -99,22 +76,19 @@ def retrieve_docs(query: str) -> str:
9976
query (str): The user's question or search query.
10077
10178
Returns:
102-
str: Formatted string containing relevant document content retrieved and ranked based on the query.
79+
str: Formatted string containing relevant document content.
10380
"""
10481
try:
105-
# Use the retriever to fetch relevant documents based on the query
106-
retrieved_docs = retriever.invoke(query)
107-
# Re-rank docs with Vertex AI Rank for better relevance
108-
ranked_docs = compressor.compress_documents(
109-
documents=retrieved_docs, query=query
82+
return search_collection(
83+
query=query,
84+
collection_path=vector_search_collection,
11085
)
111-
# Format ranked documents into a consistent structure for LLM consumption
112-
formatted_docs = format_docs.format(docs=ranked_docs)
11386
except Exception as e:
114-
return f"Calling retrieval tool with query:\n\n{query}\n\nraised the following error:\n\n{type(e)}: {e}"
115-
116-
return formatted_docs
117-
87+
return (
88+
f"Calling retrieval tool with query:\n\n{query}\n\n"
89+
f"raised the following error:\n\n{type(e)}: {e}"
90+
)
91+
{% endif %}
11892

11993
instruction = """You are an AI assistant for question-answering tasks.
12094
Answer to the best of your ability using the context provided.
@@ -129,7 +103,11 @@ def retrieve_docs(query: str) -> str:
129103
retry_options=types.HttpRetryOptions(attempts=3),
130104
),
131105
instruction=instruction,
106+
{%- if cookiecutter.datastore_type == "vertex_ai_search" %}
107+
tools=[vertex_search_tool],
108+
{%- elif cookiecutter.datastore_type == "vertex_ai_vector_search" %}
132109
tools=[retrieve_docs],
110+
{%- endif %}
133111
)
134112

135113
{%- if cookiecutter.bq_analytics %}

0 commit comments

Comments (0)