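"""Chunk markdown documentation, embed it with Gemini, and store it in BigQuery.

`backfill()` walks every markdown file under `docs/`, writes the raw text to
the `doc_text` table, and writes embedded chunks to `doc_text_vector`;
`process_single_file()` does the same for a single file.
"""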
import hashlib
import os
import random
import time
from pathlib import Path

import google.generativeai as genai
from dotenv import load_dotenv
from google.api_core.exceptions import GoogleAPIError, ResourceExhausted
from google.cloud import bigquery
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Allowlist of tables this script is allowed to query by name.
DOC_TABLES = {'doc_text', 'doc_text_vector'}

# Gemini embedding model, shared by every embedding call below.
EMBEDDING_MODEL = 'models/gemini-embedding-exp-03-07'

load_dotenv()

genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
bq_client = bigquery.Client()
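
# Assumed table shapes, inferred from the insert payloads below (a sketch, not
# the repo's actual DDL):
#
#   doc_text:        doc_id STRING, doc_text STRING
#   doc_text_vector: doc_id STRING, chunk_id INT64,
#                    vectorized_chunk ARRAY<FLOAT64>, text STRING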

def chunk_markdown_file(markdown_text, chunk_size=500, chunk_overlap=50):
    # Create the text splitter with the Chroma-recommended separators.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""]
    )

    # Split the document into chunks.
    return text_splitter.split_text(markdown_text)
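
# How the splitter behaves (illustrative note, not from the source): it tries
# each separator in order -- paragraph breaks first, then lines, sentences,
# words -- and only falls back to character-level splits when a piece still
# exceeds chunk_size. chunk_overlap repeats ~50 characters across adjacent
# chunks so context isn't lost at the boundaries. For example:
#
#     chunks = chunk_markdown_file("# Title\n\nPara one...\n\nPara two...")
#     # -> short docs come back as one chunk; long ones split on "\n\n" first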

class Chunk:
    """A chunk of document text paired with its embedding vector."""

    def __init__(self, chunk_vector, chunk_text):
        self.chunk_vector = chunk_vector
        self.chunk_text = chunk_text

def vectorize_chunk_with_retry(chunk, max_retries=5):
    """Embed a chunk, retrying 429s with exponential backoff plus jitter.

    The wait grows as 2**attempt plus 0.5-1.5s of random jitter, i.e. roughly
    1.5-2.5s, then 2.5-3.5s, 4.5-5.5s, and so on.
    """
    for attempt in range(max_retries):
        try:
            print(f"Embedding chunk (attempt {attempt + 1})...")
            response = genai.embed_content(
                model=EMBEDDING_MODEL,
                content=chunk,
                task_type="retrieval_document"
            )
            return Chunk(response["embedding"], chunk)
        except ResourceExhausted:
            wait = 2 ** attempt + random.uniform(0.5, 1.5)
            print(f"🚧 Rate limit hit (429). Retrying in {wait:.2f}s...")
            time.sleep(wait)
        except GoogleAPIError as e:
            print(f"🚨 Unexpected API error: {e}")
            raise
    raise RuntimeError("❌ Max retries exceeded for Gemini embedding.")

def vectorize_chunk(chunk):
    """Embed a single chunk with no retries; prefer the retry variant for bulk runs."""
    response = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=chunk,
        task_type="retrieval_document"  # use "retrieval_query" for queries
    )
    return Chunk(response["embedding"], chunk)
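
# Query-side counterpart (a sketch, not part of the original pipeline): the
# comment above notes that queries should use task_type "retrieval_query".
# A search entry point might embed the user's question like this before
# matching it against the stored document vectors.
def vectorize_query(query_text):
    response = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=query_text,
        task_type="retrieval_query"
    )
    return response["embedding"]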

def compute_file_hash(file_path):
    """SHA-256 of the file's bytes; used as the document ID in BigQuery."""
    return hashlib.sha256(file_path.read_bytes()).hexdigest()

def write_to_bq_doc_text(file_hash, content):
    # insert_rows uses the streaming API, so these rows sit in the streaming
    # buffer for a while and cannot be deleted or updated immediately.
    table = bq_client.get_table("flutterflow-io-6f20.documentation.doc_text")
    errors = bq_client.insert_rows(table, [{"doc_id": file_hash, "doc_text": content}])
    if errors:
        print(f"Encountered errors while inserting rows: {errors}")
    else:
        print("Successfully inserted rows into BigQuery")

def file_already_uploaded(table, file_hash):
    """Return True if a row with this doc_id already exists in the given table."""
    # Table names cannot be bound as query parameters, so validate against the
    # allowlist before interpolating the name into the SQL string.
    if table not in DOC_TABLES:
        raise ValueError(f"Invalid table: {table}")
    dataset_table = f"flutterflow-io-6f20.documentation.{table}"
    query = f"""
        SELECT COUNT(*) AS count
        FROM `{dataset_table}`
        WHERE doc_id = @file_hash
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("file_hash", "STRING", file_hash),
        ]
    )
    result = bq_client.query(query, job_config=job_config).result()
    return next(result).count > 0

def batch_write_to_bq(table, rows, batch_size=10):
    """Stream rows into a bigquery.Table in batches of batch_size."""
    for i in range(0, len(rows), batch_size):
        batch = rows[i:i + batch_size]
        errors = bq_client.insert_rows(table, batch)
        if errors:
            print(f"Errors inserting batch {i // batch_size}: {errors}")
        else:
            print(f"Successfully inserted batch {i // batch_size}")

def write_to_bq_doc_text_vector(file_hash, chunks):
    table_id = "flutterflow-io-6f20.documentation.doc_text_vector"

    # Step 1: Delete any existing rows for this doc_id. This is safe because
    # the table is written with load jobs rather than the streaming API.
    delete_query = f"""
        DELETE FROM `{table_id}`
        WHERE doc_id = @file_hash
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("file_hash", "STRING", file_hash)
        ]
    )
    bq_client.query(delete_query, job_config=job_config).result()

    # Step 2: Embed each chunk, using the retry wrapper so a long run survives
    # embedding-API rate limits.
    to_insert = []
    for i, chunk_text in enumerate(chunks):
        chunk = vectorize_chunk_with_retry(chunk_text)
        to_insert.append({
            "doc_id": file_hash,
            "chunk_id": i,
            "vectorized_chunk": chunk.chunk_vector,
            "text": chunk.chunk_text
        })

    # Step 3: Batch insert via a load job.
    job = bq_client.load_table_from_json(
        to_insert,
        table_id,
        job_config=bigquery.LoadJobConfig(
            write_disposition="WRITE_APPEND",  # append instead of overwrite
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        )
    )
    job.result()
    print(f"Inserted {len(to_insert)} rows into {table_id}")
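
# Retrieval sketch (an assumption, nothing below is called by this script):
# one way to query the stored vectors is BigQuery's VECTOR_SEARCH function.
# The table and column names match the load job above; the query shape and the
# COSINE distance choice are illustrative, not taken from the repo.
def search_chunks(query_text, top_k=5):
    query_vector = vectorize_query(query_text)  # helper sketched earlier
    sql = f"""
        SELECT base.doc_id, base.chunk_id, base.text, distance
        FROM VECTOR_SEARCH(
            TABLE `flutterflow-io-6f20.documentation.doc_text_vector`,
            'vectorized_chunk',
            (SELECT @query_vector AS vectorized_chunk),
            top_k => {int(top_k)},
            distance_type => 'COSINE'
        )
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("query_vector", "FLOAT64", query_vector)
        ]
    )
    return list(bq_client.query(sql, job_config=job_config).result())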

def backfill():
    current_file = Path(__file__).resolve()

    # Navigate to project root: `.github/scripts/` → project root → `docs/`
    project_root = current_file.parents[2]
    docs_dir = project_root / 'docs'
    for file_path in docs_dir.rglob("*.md"):
        file_hash = compute_file_hash(file_path)
        markdown_text = file_path.read_text(encoding='utf-8')
        if not file_already_uploaded('doc_text', file_hash):
            write_to_bq_doc_text(file_hash, markdown_text)
        if not file_already_uploaded('doc_text_vector', file_hash):
            chunks = chunk_markdown_file(markdown_text)
            write_to_bq_doc_text_vector(file_hash, chunks)
        else:
            print(f"Skipping {file_path} because it already exists in BigQuery")
    print("Finished vectorizing all files")

def process_single_file(file_path: Path):
    file_hash = compute_file_hash(file_path)
    markdown_text = file_path.read_text(encoding='utf-8')
    if not file_already_uploaded('doc_text', file_hash):
        write_to_bq_doc_text(file_hash, markdown_text)
    if not file_already_uploaded('doc_text_vector', file_hash):
        chunks = chunk_markdown_file(markdown_text)
        write_to_bq_doc_text_vector(file_hash, chunks)
    else:
        print(f"Skipping {file_path} because it already exists in BigQuery")
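
# Hypothetical usage (not in the original script): a CI hook could re-index a
# single changed file instead of re-running the full backfill, e.g.
#
#     process_single_file(Path("docs/getting-started.md"))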

if __name__ == "__main__":
    backfill()