diff --git a/.gitignore b/.gitignore
index d14eab8..bc6d753 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 dist
 node_modules
-.env
\ No newline at end of file
+.env
+ingest/build
+ingest/postgres
diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py
new file mode 100644
index 0000000..a37f16e
--- /dev/null
+++ b/ingest/postgres_docs.py
@@ -0,0 +1,475 @@
+from dataclasses import dataclass
+from dotenv import load_dotenv
+from bs4 import BeautifulSoup, element as BeautifulSoupElement
+import json
+from markdownify import markdownify
+import openai
+import os
+from pathlib import Path
+import psycopg
+import re
+import shutil
+import subprocess
+import tiktoken
+
+
+THIS_DIR = Path(__file__).parent.resolve()
+
+load_dotenv(dotenv_path=os.path.join(THIS_DIR, "..", ".env"))
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+POSTGRES_DIR = THIS_DIR / "postgres"
+SGML_DIR = POSTGRES_DIR / "doc" / "src" / "sgml"
+HTML_DIR = SGML_DIR / "html"
+BUILD_DIR = THIS_DIR / "build"
+BUILD_DIR.mkdir(exist_ok=True)
+MD_DIR = BUILD_DIR / "md"
+
+POSTGRES_BASE_URL = "https://www.postgresql.org/docs"
+
+ENC = tiktoken.get_encoding("cl100k_base")
+MAX_CHUNK_TOKENS = 7000
+
+# HTML pages that carry no useful documentation content. NOTE: entries must
+# be .html names -- the original list contained "appendix-obsolete.md" and
+# "appendixes.md", which could never match an *.html filename.
+SKIPPED_HTML_FILES = {
+    "legalnotice.html",
+    "appendix-obsolete.html",
+    "appendixes.html",
+    "biblio.html",
+    "bookindex.html",
+    "bug-reporting.html",
+    "source-format.html",
+    "error-message-reporting.html",
+    "error-style-guide.html",
+    "source-conventions.html",
+    "sourcerepo.html",
+}
+
+
+def update_repo() -> None:
+    """Clone the postgres repo on first use, otherwise fetch new refs."""
+    if not POSTGRES_DIR.exists():
+        subprocess.run(
+            "git clone https://github.com/postgres/postgres.git postgres",
+            shell=True,
+            check=True,
+            env=os.environ,
+            text=True,
+            # clone into THIS_DIR regardless of the caller's cwd (the
+            # original cloned into whatever directory the script ran from)
+            cwd=THIS_DIR,
+        )
+    else:
+        subprocess.run(
+            "git fetch",
+            shell=True,
+            check=True,
+            env=os.environ,
+            text=True,
+            cwd=POSTGRES_DIR,
+        )
+
+
+def build_html(version: int, tag: str) -> None:
+    """Check out git *tag* and build the HTML docs into HTML_DIR.
+
+    *version* is only used for log output; the checkout is driven by *tag*.
+    """
+    html_stamp = SGML_DIR / "html-stamp"
+
+    # make uses the presence of html-stamp to determine if it needs to
+    # rebuild the html docs, so remove it to force a rebuild.
+    if html_stamp.exists():
+        html_stamp.unlink()
+
+    if HTML_DIR.exists():
+        shutil.rmtree(HTML_DIR)
+
+    print(f"checking out version {version} at {tag}...")
+    subprocess.run(
+        f"git checkout {tag}",
+        shell=True,
+        check=True,
+        env=os.environ,
+        text=True,
+        cwd=POSTGRES_DIR,
+    )
+
+    print("configuring postgres build...")
+    environ = os.environ.copy()
+    # Shim for macOS and icu4c installed via homebrew, where it's not linked
+    # into /usr/local by default.
+    if Path("/opt/homebrew/opt/icu4c/lib/pkgconfig").exists():
+        environ["PKG_CONFIG_PATH"] = "/opt/homebrew/opt/icu4c/lib/pkgconfig"
+    subprocess.run(
+        "./configure --without-readline --without-zlib",
+        shell=True,
+        check=True,
+        env=environ,
+        text=True,
+        cwd=POSTGRES_DIR,
+    )
+
+    print("building postgres docs...")
+    subprocess.run(
+        "make html",
+        shell=True,
+        check=True,
+        env=os.environ,
+        text=True,
+        cwd=SGML_DIR,
+    )
+
+
+def build_markdown() -> None:
+    """Convert built HTML docs into markdown files with YAML frontmatter."""
+    print("converting to markdown...")
+    if MD_DIR.exists():
+        shutil.rmtree(MD_DIR)
+    MD_DIR.mkdir()
+
+    for html_file in HTML_DIR.glob("*.html"):
+        if html_file.name in SKIPPED_HTML_FILES or html_file.name.startswith("docguide"):
+            continue
+        md_file = MD_DIR / (html_file.stem + ".md")
+
+        html_content = html_file.read_text(encoding="utf-8")
+        # NOTE(review): replacing the empty string is a no-op; the original
+        # substitution target appears to have been lost -- confirm what
+        # markup this was meant to strip.
+        html_content = html_content.replace(
+            '', ""
+        )
+
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        is_refentry = bool(soup.find("div", class_="refentry"))
+
+        # the first div carrying an id gives the canonical slug of the page
+        elem = soup.find("div", attrs={"id": True})
+        if elem and isinstance(elem, BeautifulSoupElement.Tag):
+            slug = str(elem["id"]).lower() + ".html"
+        else:
+            raise SystemError(f"No div with id found in {html_file}")
+
+        title = soup.find("title")
+        title_text = (
+            str(title.string).strip()
+            if title and isinstance(title, BeautifulSoupElement.Tag)
+            else "PostgreSQL Documentation"
+        )
+        if title:
+            title.decompose()
+        for class_name in ["navheader", "navfooter"]:
+            for div in soup.find_all("div", class_=class_name):
+                div.decompose()
+
+        # Convert first h3 to h4 in notice/warning/tip divs; h4 renders as
+        # "####", which the chunker's 1-3 hash header pattern ignores.
+        if not is_refentry:
+            for class_name in ["caution", "important", "notice", "warning", "tip", "note"]:
+                for div in soup.find_all("div", class_=class_name):
+                    if div is None or not isinstance(div, BeautifulSoupElement.Tag):
+                        continue
+                    h3 = div.find("h3")
+                    if h3 and isinstance(h3, BeautifulSoupElement.Tag):
+                        h3.name = "h4"
+
+        md_content = markdownify(str(soup), heading_style="ATX")
+        md_content = f"""---
+title: {title_text}
+slug: {slug}
+refentry: {is_refentry}
+---
+{md_content}"""
+        md_file.write_text(md_content, encoding="utf-8")
+
+
+@dataclass
+class Page:
+    id: int  # database id, assigned by insert_page
+    version: int  # postgres major version
+    url: str
+    domain: str
+    filename: str
+
+
+@dataclass
+class Chunk:
+    idx: int  # chunk index within the page
+    header: str  # header text of the section the chunk belongs to
+    header_path: list[str]  # headers from the page root down to this chunk
+    content: str
+    token_count: int = 0
+    subindex: int = 0  # sub-chunk index when an oversized chunk is split
+
+
+def insert_page(
+    conn: psycopg.Connection,
+    page: Page,
+) -> None:
+    """Insert *page* into the staging table and store its new id on page.id."""
+    print('inserting page', page.filename, page.url)
+    result = conn.execute(
+        "insert into docs.postgres_pages_tmp (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id",
+        [
+            page.version,
+            page.url,
+            page.domain,
+            page.filename,
+            0,  # content_length, filled in later by update_page_stats
+            0,  # chunks_count, likewise
+        ],
+    )
+    row = result.fetchone()
+    assert row is not None
+    page.id = row[0]
+
+
+def update_page_stats(
+    conn: psycopg.Connection,
+    page: Page,
+) -> None:
+    """Refresh content_length / chunks_count of *page* from its chunks.
+
+    The original passed two parameters to a query containing no %s
+    placeholders (a psycopg error) and aggregated over every page; the
+    subquery is now filtered to this page's chunks.
+    """
+    conn.execute("""
+        update docs.postgres_pages_tmp p
+        set
+            content_length = coalesce(chunks_stats.total_length, 0),
+            chunks_count = coalesce(chunks_stats.chunks_count, 0)
+        from (
+            select
+                page_id,
+                sum(char_length(content)) as total_length,
+                count(*) as chunks_count
+            from docs.postgres_chunks_tmp
+            where page_id = %s
+            group by page_id
+        ) as chunks_stats
+        where p.id = chunks_stats.page_id
+    """, [page.id])
+
+
+def insert_chunk(
+    conn: psycopg.Connection,
+    page: Page,
+    chunk: Chunk,
+) -> None:
+    """Embed *chunk* and insert it into the staging chunks table."""
+    client = openai.OpenAI(api_key=OPENAI_API_KEY)
+    # Prefix the chunk with its header path so the embedding carries the
+    # surrounding document structure (the original built this prefix but
+    # then embedded the bare chunk content and discarded it).
+    contextualized = ""
+    for depth, header in enumerate(chunk.header_path, start=1):
+        contextualized += "#" * depth + " " + header + "\n\n"
+    contextualized += chunk.content
+    embedding = client.embeddings.create(
+        model="text-embedding-3-small",
+        input=contextualized,
+    ).data[0].embedding
+    print(f"header: {chunk.header}")
+    url = page.url
+    # Markdown headers of subsections carry an anchor like "Title (#ID)";
+    # append it so the stored url deep-links into the page.
+    if len(chunk.header_path) > 1:
+        match = re.search(r'\((#\S+)\)', chunk.header_path[-1])
+        if match:
+            url += match.group(1).lower()
+    conn.execute(
+        "insert into docs.postgres_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)",
+        [
+            page.id,
+            chunk.idx,
+            chunk.subindex,
+            chunk.content,
+            json.dumps({
+                "header": chunk.header,
+                "header_path": chunk.header_path,
+                "source_url": url,
+                "token_count": chunk.token_count,
+            }),
+            embedding,
+        ],
+    )
+
+
+def split_chunk(chunk: Chunk) -> list[Chunk]:
+    """Split an oversized chunk into roughly equal token-sized sub-chunks."""
+    num_subchunks = (chunk.token_count // MAX_CHUNK_TOKENS) + 1
+    input_ids = ENC.encode(chunk.content)
+
+    tokens_per_chunk = len(input_ids) // num_subchunks
+
+    subchunks = []
+    subindex = 0
+    idx = 0
+    while idx < len(input_ids):
+        cur_idx = min(idx + tokens_per_chunk, len(input_ids))
+        chunk_ids = input_ids[idx:cur_idx]
+        if not chunk_ids:
+            break
+        decoded = ENC.decode(chunk_ids)
+        if decoded:
+            subchunks.append(Chunk(
+                idx=chunk.idx,
+                header=chunk.header,
+                header_path=chunk.header_path,
+                content=decoded,
+                token_count=len(chunk_ids),
+                subindex=subindex,
+            ))
+        subindex += 1
+        if cur_idx == len(input_ids):
+            break
+        idx += tokens_per_chunk
+    return subchunks
+
+
+def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None:
+    """Token-count, filter, split if oversized, and insert *chunk*."""
+    if chunk.content == "":  # discard empty chunks
+        return
+
+    chunk.token_count = len(ENC.encode(chunk.content))
+    if chunk.token_count < 10:  # discard chunks that are too tiny to be useful
+        return
+
+    chunks = [chunk]
+
+    if chunk.token_count > MAX_CHUNK_TOKENS:
+        print(f"Chunk {chunk.header} too large ({chunk.token_count} tokens), splitting...")
+        chunks = split_chunk(chunk)
+
+    for piece in chunks:
+        insert_chunk(conn, page, piece)
+    conn.commit()
+
+
+def create_tmp_tables(conn: psycopg.Connection) -> None:
+    """(Re)create staging tables mirroring the live docs tables."""
+    conn.execute("drop table if exists docs.postgres_chunks_tmp")
+    conn.execute("drop table if exists docs.postgres_pages_tmp")
+    conn.execute("create table docs.postgres_pages_tmp (like docs.postgres_pages including all)")
+    conn.execute("create table docs.postgres_chunks_tmp (like docs.postgres_chunks including all)")
+    conn.execute("alter table docs.postgres_chunks_tmp add foreign key (page_id) references docs.postgres_pages_tmp(id)")
+    conn.commit()
+
+
+def swap_tables(conn: psycopg.Connection) -> None:
+    """Replace the live tables with the freshly built staging tables."""
+    with conn.cursor() as cur:
+        # drop chunks first: it holds the foreign key into pages
+        cur.execute("drop table if exists docs.postgres_chunks")
+        cur.execute("drop table if exists docs.postgres_pages")
+        cur.execute("alter table docs.postgres_chunks_tmp rename to postgres_chunks")
+        cur.execute("alter table docs.postgres_pages_tmp rename to postgres_pages")
+    conn.commit()
+
+
+def chunk_files(conn: psycopg.Connection, version: int) -> None:
+    """Chunk every markdown page of *version* into the staging tables.
+
+    Pages are split at markdown headers of depth 1-3; headers inside
+    fenced code blocks are ignored, and refentry (man-style) pages are
+    stored as a single chunk.
+    """
+    header_pattern = re.compile(r'^(#{1,3}) .+$')
+    codeblock_pattern = re.compile(r'^```')
+
+    section_prefix = r'^[A-Za-z0-9.]+\.\s*'
+    chapter_prefix = r'^Chapter\s+[0-9]+\.\s*'
+
+    for md in MD_DIR.glob("*.md"):
+        print(f"chunking {md}...")
+        with md.open() as f:
+            # process the frontmatter
+            f.readline()  # opening "---"
+            f.readline()  # title line
+            slug = f.readline().split(":", 1)[1].strip()
+            refentry = f.readline().split(":", 1)[1].strip().lower() == "true"
+            f.readline()  # closing "---"
+
+            page = Page(
+                id=0,
+                version=version,
+                url=f"{POSTGRES_BASE_URL}/{version}/{slug}",
+                domain="postgresql.org",
+                filename=md.name,
+            )
+
+            insert_page(conn, page)
+
+            header_path = []
+            idx = 0
+            chunk: Chunk | None = None
+            in_codeblock = False
+            while True:
+                line = f.readline()
+                if line == "":
+                    # EOF: flush the trailing chunk
+                    if chunk is not None:
+                        process_chunk(conn, page, chunk)
+                    break
+                match = header_pattern.match(line)
+                # headers inside code fences, and all but the first header
+                # of a refentry page, are treated as plain content
+                if match is None or in_codeblock or (refentry and chunk is not None):
+                    if codeblock_pattern.match(line):
+                        in_codeblock = not in_codeblock
+                    if chunk is None:
+                        # content before the first header; the original
+                        # asserted here and crashed on such pages
+                        continue
+                    chunk.content += line
+                    continue
+                header_hashes = match.group(1)
+                depth = len(header_hashes)
+                header_path = header_path[: depth - 1]
+                header = line.lstrip("#").strip()
+                header = re.sub(section_prefix, '', header).strip()
+                header = re.sub(chapter_prefix, '', header).strip()
+                header_path.append(header)
+                if chunk is not None:
+                    process_chunk(conn, page, chunk)
+                chunk = Chunk(
+                    idx=idx,
+                    header=header,
+                    header_path=header_path.copy(),
+                    content="",
+                )
+                idx += 1
+            update_page_stats(conn, page)
+            conn.commit()
+
+
+if __name__ == "__main__":
+    update_repo()
+    postgres_versions = [
+        (17, "REL_17_6"),
+        (16, "REL_16_9"),
+        (15, "REL_15_13"),
+        (14, "REL_14_18"),
+    ]
+    db_uri = (
+        f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}"
+        f"@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}"
+    )
+    with psycopg.connect(db_uri) as conn:
+        # Build every version into the staging tables and swap once at the
+        # end; the original recreated and swapped the tables inside
+        # chunk_files, so each version wiped out the previous one and only
+        # the last version survived.
+        create_tmp_tables(conn)
+        for version, tag in postgres_versions:
+            print(f"Building Postgres {version} documentation...")
+            build_html(version, tag)
+            build_markdown()
+            chunk_files(conn, version)
+        swap_tables(conn)