Commit 78d676a

added logic to vectorize docs and github action
1 parent ce35929 commit 78d676a

File tree

3 files changed: 240 additions & 0 deletions


.github/scripts/vectorize.py

Lines changed: 184 additions & 0 deletions

from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import google.generativeai as genai
from google.generativeai import embed_content
from google.cloud import bigquery
import hashlib
import time
import random
from google.api_core.exceptions import ResourceExhausted, GoogleAPIError
import os
from dotenv import load_dotenv

DOC_TABLES = {'doc_text', 'doc_text_vector'}
load_dotenv()

genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
bq_client = bigquery.Client()

def chunk_markdown_file(markdown_text, chunk_size=500, chunk_overlap=50):

    # Create the text splitter with the Chroma-recommended separators
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""]
    )

    # Split the document into chunks
    chunks = text_splitter.split_text(markdown_text)

    return chunks
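
# Illustrative sketch (not part of this commit): on a small made-up document the
# splitter prefers the "\n\n" separator, then falls back to finer ones, merging
# adjacent pieces back together while they still fit within chunk_size:
#
#   sample = (
#       "# Widgets\n\n"
#       "Widgets are the building blocks of a page.\n\n"
#       "Each widget exposes properties you can configure."
#   )
#   for i, piece in enumerate(chunk_markdown_file(sample, chunk_size=60, chunk_overlap=10)):
#       print(i, repr(piece))
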
class Chunk:
    def __init__(self, chunk_vector, chunk_text):
        self.chunk_vector = chunk_vector
        self.chunk_text = chunk_text

def vectorize_chunk_with_retry(chunk, max_retries=5):
    model = 'models/gemini-embedding-exp-03-07'
    for attempt in range(max_retries):
        try:
            print(f"Embedding chunk (attempt {attempt + 1})...")
            response = embed_content(
                model=model,
                content=chunk,
                task_type="retrieval_document"
            )
            embedding = response["embedding"]
            return Chunk(embedding, chunk)
        except ResourceExhausted as e:
            wait = 2 ** attempt + random.uniform(0.5, 1.5)
            print(f"🚧 Rate limit hit (429). Retrying in {wait:.2f}s...")
            time.sleep(wait)
        except GoogleAPIError as e:
            print(f"🚨 Unexpected API error: {e}")
            raise e
    raise RuntimeError("❌ Max retries exceeded for Gemini embedding.")


def vectorize_chunk(chunk):
    model = 'models/gemini-embedding-exp-03-07'
    response = embed_content(
        model=model,
        content=chunk,
        task_type="retrieval_document"  # use "retrieval_query" for queries
    )
    embedding = response["embedding"]
    return Chunk(embedding, chunk)
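
# Illustrative sketch (not part of this commit): per the inline comment above,
# query embeddings use task_type="retrieval_query". A hypothetical query-side
# helper mirroring vectorize_chunk (the name and example question are made up):
def vectorize_query(query_text):
    response = embed_content(
        model='models/gemini-embedding-exp-03-07',
        content=query_text,
        task_type="retrieval_query"
    )
    return response["embedding"]
# e.g. query_vector = vectorize_query("How do I add a custom widget?")
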
def compute_file_hash(file_path):
    content = file_path.read_bytes()
    return hashlib.sha256(content).hexdigest()

def write_to_bq_doc_text(file_hash, content):
    bq_client = bigquery.Client()
    table = bq_client.get_table("flutterflow-io-6f20.documentation.doc_text")
    errors = bq_client.insert_rows(table, [{"doc_id": file_hash, "doc_text": content}])
    if errors:
        print(f"Encountered errors while inserting rows: {errors}")
    else:
        print("Successfully inserted rows into bigquery")

def file_already_uploaded(table, file_hash):
    if table not in DOC_TABLES:
        raise ValueError(f"Invalid table: {table}")
    dataset_table = f"flutterflow-io-6f20.documentation.{table}"
    query = f"""
        SELECT COUNT(*) as count
        FROM `{dataset_table}`
        WHERE doc_id = @file_hash
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("file_hash", "STRING", file_hash),
        ]
    )
    result = bq_client.query(query, job_config=job_config).result()
    return next(result).count > 0

def batch_write_to_bq(table, rows, batch_size=10):
    for i in range(0, len(rows), batch_size):
        batch = rows[i:i+batch_size]
        errors = bq_client.insert_rows(table, batch)
        if errors:
            print(f"Errors inserting batch {i//batch_size}: {errors}")
        else:
            print(f"Successfully inserted batch {i//batch_size}")

def write_to_bq_doc_text_vector(file_hash, content, chunks):
    bq_client = bigquery.Client()
    table_id = "flutterflow-io-6f20.documentation.doc_text_vector"

    # Step 1: Delete existing rows for the doc_id (now safe since we're not using streaming)
    delete_query = f"""
        DELETE FROM `{table_id}`
        WHERE doc_id = @file_hash
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("file_hash", "STRING", file_hash)
        ]
    )
    bq_client.query(delete_query, job_config=job_config).result()

    # Step 2: Generate vectorized rows
    to_insert = []
    for i, chunk_text in enumerate(chunks):
        chunk = vectorize_chunk(chunk_text)
        row = {
            "doc_id": file_hash,
            "chunk_id": i,
            "vectorized_chunk": chunk.chunk_vector,
            "text": chunk.chunk_text
        }
        to_insert.append(row)

    # Step 3: Batch insert via load_table_from_json
    job = bq_client.load_table_from_json(
        to_insert,
        table_id,
        job_config=bigquery.LoadJobConfig(
            write_disposition="WRITE_APPEND",  # Append instead of overwrite
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        )
    )
    job.result()
    print(f"Inserted {len(to_insert)} rows into {table_id}")
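
# Illustrative sketch (not part of this commit): the rows written above imply a
# minimal schema for the two tables. Field names come from the code; the helper
# names, types, and modes below are assumptions.
_doc_text_schema = [
    bigquery.SchemaField("doc_id", "STRING"),
    bigquery.SchemaField("doc_text", "STRING"),
]
_doc_text_vector_schema = [
    bigquery.SchemaField("doc_id", "STRING"),
    bigquery.SchemaField("chunk_id", "INT64"),
    bigquery.SchemaField("vectorized_chunk", "FLOAT64", mode="REPEATED"),
    bigquery.SchemaField("text", "STRING"),
]
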
def backfill():
    current_file = Path(__file__).resolve()

    # Navigate to project root: `.github/scripts/` → project root → `docs/`
    project_root = current_file.parents[2]
    docs_dir = project_root / 'docs'
    for file_path in docs_dir.rglob("*.md"):
        file_hash = compute_file_hash(file_path)
        with open(file_path, 'r', encoding='utf-8') as file:
            markdown_text = file.read()
        if not file_already_uploaded('doc_text', file_hash):
            write_to_bq_doc_text(file_hash, markdown_text)
        chunks = chunk_markdown_file(markdown_text)
        if not file_already_uploaded('doc_text_vector', file_hash):
            write_to_bq_doc_text_vector(file_hash, markdown_text, chunks)
        else:
            print(f"Skipping {file_path} because it already exists in BigQuery")
    print("Finished vectorizing all files")

def process_single_file(file_path: Path):
    file_hash = compute_file_hash(file_path)
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()
    chunks = chunk_markdown_file(markdown_text)
    if not file_already_uploaded('doc_text', file_hash):
        write_to_bq_doc_text(file_hash, markdown_text)
    if not file_already_uploaded('doc_text_vector', file_hash):
        write_to_bq_doc_text_vector(file_hash, markdown_text, chunks)
    else:
        print(f"Skipping {file_path} because it already exists in BigQuery")

if __name__ == "__main__":
    # Guard the backfill so that importing process_single_file (as
    # vectorize_action.py does) does not re-vectorize every document.
    backfill()
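
For context on how the table written above might be consumed, here is a minimal retrieval sketch. It is not part of the commit: it assumes the doc_text_vector schema implied by the code, uses a hypothetical top_k_chunks helper, and ranks stored chunks against a query embedding client-side with cosine similarity.

import numpy as np  # not installed by the workflow; needed only for this sketch
from google.cloud import bigquery

def top_k_chunks(query_vector, k=5, client=None):
    # Pull every stored chunk vector back out of BigQuery (fine for a small corpus)
    client = client or bigquery.Client()
    rows = client.query(
        "SELECT doc_id, chunk_id, vectorized_chunk, text "
        "FROM `flutterflow-io-6f20.documentation.doc_text_vector`"
    ).result()
    q = np.array(query_vector, dtype=float)
    scored = []
    for row in rows:
        v = np.array(row["vectorized_chunk"], dtype=float)
        score = float(np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v)))
        scored.append((score, row["doc_id"], row["text"]))
    # Highest cosine similarity first
    return sorted(scored, key=lambda s: s[0], reverse=True)[:k]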

.github/scripts/vectorize_action.py

Lines changed: 16 additions & 0 deletions

# .github/scripts/vectorize_action.py
import sys
from pathlib import Path
from vectorize import process_single_file

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide a file path to process.")
        sys.exit(1)

    file_path = Path(sys.argv[1])
    if file_path.exists():
        process_single_file(file_path)
    else:
        print(f"File not found: {file_path}")
        sys.exit(1)

.github/workflows/vectorize-docs.yml

Lines changed: 40 additions & 0 deletions

name: Vectorize Updated Docs

on:
  push:
    branches:
      - '**' # TODO: set to main when done testing
    paths:
      - 'flutterflow-documentation/docs/**/*.md'

jobs:
  vectorize:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          # Full history so the `git diff` against github.event.before below has both commits
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install langchain google-cloud-bigquery google-generativeai python-dotenv

      - name: Authenticate GCP
        env:
          GCP_CREDENTIALS: ${{ secrets.GCP_CREDENTIALS }}
        run: |
          echo "${GCP_CREDENTIALS}" > gcp-key.json
          # `export` only lasts for this step; persist the key path for later steps via GITHUB_ENV
          echo "GOOGLE_APPLICATION_CREDENTIALS=${PWD}/gcp-key.json" >> "$GITHUB_ENV"

      - name: Vectorize modified files
        env:
          # Assumes a GOOGLE_API_KEY repository secret for the Gemini embedding calls
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
        run: |
          for file in $(git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep '^flutterflow-documentation/docs/.*\.md$'); do
            echo "Processing $file..."
            python .github/scripts/vectorize_action.py "$file"
          done
