From d399fb1d415db0f3960c7a37a2f8c8b60e36be08 Mon Sep 17 00:00:00 2001
From: Matthew Peveler <mpeveler@tigerdata.com>
Date: Tue, 16 Sep 2025 15:39:19 -0600
Subject: [PATCH 1/5] feat: add postgres ingest

Signed-off-by: Matthew Peveler <mpeveler@tigerdata.com>
---
 .gitignore              |   4 +-
 ingest/postgres_docs.py | 289 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 ingest/postgres_docs.py

diff --git a/.gitignore b/.gitignore
index d14eab8..bc6d753 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 dist
 node_modules
-.env
\ No newline at end of file
+.env
+ingest/build
+ingest/postgres
diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py
new file mode 100644
index 0000000..8a57603
--- /dev/null
+++ b/ingest/postgres_docs.py
@@ -0,0 +1,289 @@
+from dataclasses import dataclass
+from dotenv import load_dotenv
+from bs4 import BeautifulSoup, element as BeautifulSoupElement
+from markdownify import markdownify
+import openai
+import os
+from pathlib import Path
+import psycopg
+import re
+import shutil
+import subprocess
+import tiktoken
+
+
+THIS_DIR = Path(__file__).parent.resolve()
+
+load_dotenv(dotenv_path=os.path.join(THIS_DIR, "..", ".env"))
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+POSTGRES_DIR = THIS_DIR / "postgres"
+SMGL_DIR = POSTGRES_DIR / "doc" / "src" / "sgml"
+HTML_DIR = SMGL_DIR / "html"
+BUILD_DIR = THIS_DIR / "build"
+BUILD_DIR.mkdir(exist_ok=True)
+MD_DIR = BUILD_DIR / "md"
+
+POSTGRES_BASE_URL = "https://www.postgresql.org/docs"
+
+ENC = tiktoken.get_encoding("o200k_base")
+
+def update_repo():
+    if not POSTGRES_DIR.exists():
+        subprocess.run(
+            ["git", "clone", "https://github.com/postgres/postgres.git", str(POSTGRES_DIR)],
+            shell=False,  # argv form: clone lands in POSTGRES_DIR regardless of the caller's cwd
+            check=True,
+            env=os.environ,
+            text=True,
+        )
+    else:
+        subprocess.run(
+            "git fetch",
+            shell=True,
+            check=True,
+            env=os.environ,
+            text=True,
+            cwd=POSTGRES_DIR,
+        )
+
+
+def build_html(version: int, tag: str) -> None:
+    html_stamp = SMGL_DIR / "html-stamp"
+
+    # make uses the presence of html-stamp to determine if it needs to
+    # rebuild the html docs.
+    if html_stamp.exists():
+        html_stamp.unlink()
+
+    if HTML_DIR.exists():
+        shutil.rmtree(HTML_DIR)
+
+    print(f"checking out version {version} at {tag}...")
+    subprocess.run(
+        f"git checkout {tag}",
+        shell=True,
+        check=True,
+        env=os.environ,
+        text=True,
+        cwd=POSTGRES_DIR,
+    )
+
+    print("configuring postgres build...")
+    environ = os.environ.copy()
+    # Shim for macOS and icu4c installed via homebrew, where it's not linked into
+    # /usr/local by default.
+    if Path("/opt/homebrew/opt/icu4c/lib/pkgconfig").exists():
+        environ["PKG_CONFIG_PATH"] = "/opt/homebrew/opt/icu4c/lib/pkgconfig"
+    subprocess.run(
+        "./configure --without-readline --without-zlib",
+        shell=True,
+        check=True,
+        env=environ,
+        text=True,
+        cwd=POSTGRES_DIR,
+    )
+
+    print("building postgres docs...")
+    subprocess.run(
+        "make html",
+        shell=True,
+        check=True,
+        env=os.environ,
+        text=True,
+        cwd=SMGL_DIR,
+    )
+
+
+def build_markdown() -> None:
+    print("converting to markdown...")
+    if MD_DIR.exists():
+        shutil.rmtree(MD_DIR)
+    MD_DIR.mkdir()
+    # Skip-list entries are compared to html_file.name, so each must carry the .html suffix.
+    for html_file in HTML_DIR.glob("*.html"):
+        if html_file.name in [
+            "legalnotice.html",
+            "appendix-obsolete.html",
+            "appendixes.html",
+            "biblio.html",
+            "bookindex.html",
+            "bug-reporting.html",
+            "source-format.html",
+            "error-message-reporting.html",
+            "error-style-guide.html",
+            "source-conventions.html",
+            "sourcerepo.html",
+        ] or html_file.name.startswith("docguide"):
+            continue
+        md_file = MD_DIR / (html_file.stem + ".md")
+
+        html_content = html_file.read_text(encoding="utf-8")
+        html_content = html_content.replace(
+            '<?xml version="1.0" encoding="UTF-8" standalone="no"?>', ""
+        )
+
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        is_refentry = bool(soup.find("div", class_="refentry"))
+
+        elem = soup.find("div", attrs={"id": True})
+        if elem and isinstance(elem, BeautifulSoupElement.Tag):
+            slug = str(elem["id"]).lower() + ".html"
+        else:
+            raise SystemError(f"No div with id found in {html_file}")
+
+        title = soup.find("title")
+        title_text = (
+            str(title.string).strip()
+            if title and isinstance(title, BeautifulSoupElement.Tag)
+            else "PostgreSQL Documentation"
+        )
+        if title:
+            title.decompose()
+        for class_name in ["navheader", "navfooter"]:
+            for div in soup.find_all("div", class_=class_name):
+                div.decompose()
+
+        # Convert first h3 to h4 in notice/warning/tip divs
+        if not is_refentry:
+            for class_name in ["caution", "important", "notice", "warning", "tip", "note"]:
+                for div in soup.find_all("div", class_=class_name):
+                    if div is None or not isinstance(div, BeautifulSoupElement.Tag):
+                        continue
+                    h3 = div.find("h3")
+                    if h3 and isinstance(h3, BeautifulSoupElement.Tag):
+                        h3.name = "h4"
+
+        md_content = markdownify(str(soup), heading_style="ATX")
+        md_content = f"""---
+title: {title_text}
+slug: {slug}
+refentry: {is_refentry}
+---
+{md_content}"""
+        md_file.write_text(md_content, encoding="utf-8")
+
+
+@dataclass
+class Chunk:
+    header: str
+    header_path: list[str]
+    content: str
+    slug: str
+    version: int
+    token_count: int = 0
+
+def insert_chunk(
+    conn: psycopg.Connection,
+    chunk: Chunk,
+) -> None:
+    client = openai.OpenAI(api_key=OPENAI_API_KEY)
+    content = ''  # header-prefixed copy of the chunk text: one '#'-level heading line per header_path entry
+    for i in range(len(chunk.header_path)):
+        content += ''.join(['#' for _ in range(i + 1)]) + ' ' + chunk.header_path[i] + '\n\n'
+    content += chunk.content
+    embedding = client.embeddings.create(
+        model="text-embedding-3-small",
+        input=chunk.content,  # NOTE(review): the header-prefixed `content` built above is never used - confirm whether it was meant to be embedded
+    ).data[0].embedding
+    content = chunk.content
+    # token_count, embedding = embed(header_path, content)
+    print(f"header: {chunk.header}")
+    conn.execute(
+        "insert into docs.postgres_2 (version, header, header_path, source_url, content, token_count, embedding) values (%s,%s,%s,%s,%s,%s,%s)",
+        [
+            chunk.version,
+            chunk.header,
+            chunk.header_path,
+            f"{POSTGRES_BASE_URL}/{chunk.version}/{chunk.slug}",
+            content,
+            0,
+            embedding,
+        ],
+    )
+    conn.commit()
+
+
+def process_chunk(conn: psycopg.Connection, chunk: Chunk) -> None:
+    if chunk.content == "":  # discard empty chunks
+        return
+
+    chunk.token_count = len(ENC.encode(chunk.content))
+    if chunk.token_count < 10:  # discard chunks that are too tiny to be useful
+        return
+
+    chunks = [chunk]
+
+    if chunk.token_count > 7000:
+        print(f"chunk {chunk.header} too large ({chunk.token_count} tokens), skipping...")
+        return
+        # chunks = chunk_by_term(chunk)
+
+    for chunk in chunks:
+        insert_chunk(conn, chunk)
+
+
+def chunk_files(conn: psycopg.Connection, version: int) -> None:
+    conn.execute("delete from docs.postgres_2 where version = %s", [version])
+
+    header_pattern = re.compile(
+        "^(#{1,3}) .+$"
+    )  # find lines that are markdown headers with 1-3 #
+    codeblock_pattern = re.compile("^```")
+    for md in MD_DIR.glob("*.md"):
+        print(f"chunking {md}...")
+        with md.open() as f:
+            # process the frontmatter
+            f.readline()
+            title_line = f.readline()
+            slug = f.readline().split(":", 1)[1].strip()
+            refentry = f.readline().split(":", 1)[1].strip().lower() == "true"
+            f.readline()
+            header_path = []
+            chunk: Chunk | None = None
+            in_codeblock = False
+            while True:
+                line = f.readline()
+                if line == "":
+                    if chunk is not None:
+                        process_chunk(conn, chunk)
+                    break
+                match = header_pattern.match(line)
+                if match is None or in_codeblock or (refentry and chunk is not None):
+                    assert chunk is not None
+                    if codeblock_pattern.match(line):
+                        in_codeblock = not in_codeblock
+                    chunk.content += line
+                    continue
+                header = match.group(1)
+                depth = len(header)
+                header_path = header_path[: (depth - 1)]
+                header_path.append(line.lstrip("#").strip())
+                if chunk is not None:
+                    process_chunk(conn, chunk)
+                chunk = Chunk(
+                    header=line.lstrip("#").strip(),
+                    header_path=header_path.copy(),
+                    content="",
+                    slug=slug,
+                    version=version,
+                )
+
+
+if __name__ == "__main__":
+    update_repo()
+    postgres_versions = [
+        (17, "REL_17_6"),
+        # (16, "REL_16_9"),
+        # (15, "REL_15_13"),
+        # (14, "REL_14_18")
+    ]
+    db_uri = f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}"
+    with psycopg.connect(db_uri) as conn:
+        for version, tag in postgres_versions:
+            print(f"Building Postgres {version} documentation...")
+            # build_html(version, tag)
+            build_markdown()
+            chunk_files(conn, version)

From 50ee21e4a97d3386d3a44066a8a1fb3b1943e5a2 Mon Sep 17 00:00:00 2001
From: Matthew Peveler <mpeveler@tigerdata.com>
Date: Tue, 16 Sep 2025 16:56:02 -0600
Subject: [PATCH 2/5] wip

Signed-off-by: Matthew Peveler <mpeveler@tigerdata.com>
---
 ingest/postgres_docs.py | 170 +++++++++++++++++++++++++++++++++-------
 1 file changed, 141 insertions(+), 29 deletions(-)

diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py
index 8a57603..c5b9031 100644
--- a/ingest/postgres_docs.py
+++ b/ingest/postgres_docs.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from dotenv import load_dotenv
 from bs4 import BeautifulSoup, element as BeautifulSoupElement
+import json
 from markdownify import markdownify
 import openai
 import os
@@ -27,7 +28,8 @@
 
 POSTGRES_BASE_URL = "https://www.postgresql.org/docs"
 
-ENC = tiktoken.get_encoding("o200k_base")
+ENC = tiktoken.get_encoding("cl100k_base")
+MAX_CHUNK_TOKENS = 7000
 
 def update_repo():
     if not POSTGRES_DIR.exists():
@@ -166,17 +168,65 @@ def build_markdown() -> None:
         md_file.write_text(md_content, encoding="utf-8")
 
 
+@dataclass
+class Page:
+    id: int
+    version: int
+    url: str
+    domain: str
+    filename: str
+    content_length: int = 0
+    chunks_count: int = 0
+
+
 @dataclass
 class Chunk:
+    idx: int
     header: str
     header_path: list[str]
     content: str
-    slug: str
-    version: int
     token_count: int = 0
+    subindex: int = 0
+
+
+def insert_page(
+    conn: psycopg.Connection,
+    page: Page,
+) -> None:
+    print('inserting page', page.filename, page.url)
+    result = conn.execute(
+        "insert into docs.postgres_pages (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id",
+        [
+            page.version,
+            page.url,
+            page.domain,
+            page.filename,
+            0,
+            0,
+        ],
+    )
+    row = result.fetchone()
+    assert row is not None
+    page.id = row[0]
+
+
+def update_page_stats(
+    conn: psycopg.Connection,
+    page: Page,
+) -> None:
+    conn.execute(
+        "update docs.postgres_pages set content_length = %s, chunks_count = %s where id = %s",
+        [
+            page.content_length,
+            page.chunks_count,
+            page.id,
+        ],
+    )
+
 
 def insert_chunk(
     conn: psycopg.Connection,
+    page: Page,
     chunk: Chunk,
 ) -> None:
     client = openai.OpenAI(api_key=OPENAI_API_KEY)
@@ -191,22 +241,65 @@ def insert_chunk(
     content = chunk.content
     # token_count, embedding = embed(header_path, content)
     print(f"header: {chunk.header}")
+    url = page.url
+    if len(chunk.header_path) > 1:
+        pattern = r'\((#\S+)\)'
+        match = re.search(pattern, chunk.header_path[-1])
+        if match:
+            url += match.group(1).lower()
     conn.execute(
-        "insert into docs.postgres_2 (version, header, header_path, source_url, content, token_count, embedding) values (%s,%s,%s,%s,%s,%s,%s)",
+        "insert into docs.postgres_chunks (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)",
         [
-            chunk.version,
-            chunk.header,
-            chunk.header_path,
-            f"{POSTGRES_BASE_URL}/{chunk.version}/{chunk.slug}",
-            content,
-            0,
+            page.id,
+            chunk.idx,
+            chunk.subindex,
+            chunk.content,
+            json.dumps({
+                "header": chunk.header,
+                "header_path": chunk.header_path,
+                "source_url": url,
+                "token_count": chunk.token_count,
+            }),
             embedding,
         ],
     )
-    conn.commit()
 
 
-def process_chunk(conn: psycopg.Connection, chunk: Chunk) -> None:
+def split_chunk(chunk: Chunk) -> list[Chunk]:  # split an oversized chunk into near-equal token windows
+    num_subchunks = (chunk.token_count // MAX_CHUNK_TOKENS) + 1
+    input_ids = ENC.encode(chunk.content)
+    # Ceiling division: flooring left a remainder that spilled into an extra, tiny sub-chunk.
+    tokens_per_chunk = -(-len(input_ids) // num_subchunks)
+
+    subchunks = []
+    subindex = 0
+    idx = 0
+    while idx < len(input_ids):
+        cur_idx = min(idx + tokens_per_chunk, len(input_ids))
+        chunk_ids = input_ids[idx:cur_idx]
+        if not chunk_ids:
+            break
+        decoded = ENC.decode(chunk_ids)
+        if decoded:
+            subchunks.append(Chunk(
+                idx=chunk.idx,
+                header=chunk.header,
+                header_path=chunk.header_path,
+                content=decoded,
+                token_count=len(chunk_ids),
+                subindex=subindex,  # position of this window within the parent chunk
+            ))
+            subindex += 1
+        if cur_idx == len(input_ids):
+            break
+        idx += tokens_per_chunk
+    return subchunks
+
+
+def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None:
+    page.content_length += len(chunk.content)
+    page.chunks_count += 1
+
     if chunk.content == "":  # discard empty chunks
         return
 
@@ -216,39 +309,55 @@ def process_chunk(conn: psycopg.Connection, chunk: Chunk) -> None:
 
     chunks = [chunk]
 
-    if chunk.token_count > 7000:
-        print(f"chunk {chunk.header} too large ({chunk.token_count} tokens), skipping...")
-        return
-        # chunks = chunk_by_term(chunk)
+    if chunk.token_count > MAX_CHUNK_TOKENS:
+        print(f"Chunk {chunk.header} too large ({chunk.token_count} tokens), splitting...")
+        chunks = split_chunk(chunk)
 
     for chunk in chunks:
-        insert_chunk(conn, chunk)
+        insert_chunk(conn, page, chunk)
+    conn.commit()
 
 
 def chunk_files(conn: psycopg.Connection, version: int) -> None:
-    conn.execute("delete from docs.postgres_2 where version = %s", [version])
+    conn.execute("delete from docs.postgres_chunks where page_id IN (select id from docs.postgres_pages where version = %s)", [version])
+    conn.execute("delete from docs.postgres_pages where version = %s", [version])
+    conn.commit()
 
     header_pattern = re.compile(
         "^(#{1,3}) .+$"
-    )  # find lines that are markdown headers with 1-3 #
+    )
+    section_prefix = r'^[A-Za-z0-9.]+\.\s*'
+    # TODO: trim Chapter ##. prefix from headers too, e.g.: Chapter 65. Database Physical Storage
     codeblock_pattern = re.compile("^```")
     for md in MD_DIR.glob("*.md"):
         print(f"chunking {md}...")
         with md.open() as f:
             # process the frontmatter
             f.readline()
-            title_line = f.readline()
+            f.readline()  # title line
             slug = f.readline().split(":", 1)[1].strip()
             refentry = f.readline().split(":", 1)[1].strip().lower() == "true"
             f.readline()
+
+            page = Page(
+                id=0,
+                version=version,
+                url=f"{POSTGRES_BASE_URL}/{version}/{slug}",
+                domain="postgresql.org",
+                filename=md.name,
+            )
+
+            insert_page(conn, page)
+
             header_path = []
+            idx = 0
             chunk: Chunk | None = None
             in_codeblock = False
             while True:
                 line = f.readline()
                 if line == "":
                     if chunk is not None:
-                        process_chunk(conn, chunk)
+                        process_chunk(conn, page, chunk)
                     break
                 match = header_pattern.match(line)
                 if match is None or in_codeblock or (refentry and chunk is not None):
@@ -257,19 +366,22 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
                         in_codeblock = not in_codeblock
                     chunk.content += line
                     continue
-                header = match.group(1)
-                depth = len(header)
+                header_hases = match.group(1)
+                depth = len(header_hases)
                 header_path = header_path[: (depth - 1)]
-                header_path.append(line.lstrip("#").strip())
+                header = re.sub(section_prefix, '', line.lstrip("#").strip()).strip()
+                header_path.append(header)
                 if chunk is not None:
-                    process_chunk(conn, chunk)
+                    process_chunk(conn, page, chunk)
                 chunk = Chunk(
-                    header=line.lstrip("#").strip(),
+                    idx=idx,
+                    header=header,
                     header_path=header_path.copy(),
                     content="",
-                    slug=slug,
-                    version=version,
                 )
+                idx += 1
+            update_page_stats(conn, page)
+            conn.commit()
 
 
 if __name__ == "__main__":
@@ -285,5 +397,5 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
         for version, tag in postgres_versions:
             print(f"Building Postgres {version} documentation...")
             # build_html(version, tag)
-            build_markdown()
+            # build_markdown()
             chunk_files(conn, version)

From c423ef542af9a2704b6467b98785b9005db1edc7 Mon Sep 17 00:00:00 2001
From: Matthew Peveler <mpeveler@tigerdata.com>
Date: Tue, 16 Sep 2025 17:46:13 -0600
Subject: [PATCH 3/5] wip

Signed-off-by: Matthew Peveler <mpeveler@tigerdata.com>
---
 ingest/postgres_docs.py | 44 ++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py
index c5b9031..f336e79 100644
--- a/ingest/postgres_docs.py
+++ b/ingest/postgres_docs.py
@@ -195,7 +195,7 @@ def insert_page(
 ) -> None:
     print('inserting page', page.filename, page.url)
     result = conn.execute(
-        "insert into docs.postgres_pages (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id",
+        "insert into docs.postgres_pages_tmp (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id",
         [
             page.version,
             page.url,
@@ -215,7 +215,7 @@ def update_page_stats(
     page: Page,
 ) -> None:
     conn.execute(
-        "update docs.postgres_pages set content_length = %s, chunks_count = %s where id = %s",
+        "update docs.postgres_pages_tmp set content_length = %s, chunks_count = %s where id = %s",
         [
             page.content_length,
             page.chunks_count,
@@ -248,7 +248,7 @@ def insert_chunk(
         if match:
             url += match.group(1).lower()
     conn.execute(
-        "insert into docs.postgres_chunks (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)",
+        "insert into docs.postgres_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)",
         [
             page.id,
             chunk.idx,
@@ -319,16 +319,19 @@ def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None:
 
 
 def chunk_files(conn: psycopg.Connection, version: int) -> None:
-    conn.execute("delete from docs.postgres_chunks where page_id IN (select id from docs.postgres_pages where version = %s)", [version])
-    conn.execute("delete from docs.postgres_pages where version = %s", [version])
+    conn.execute("drop table if exists docs.postgres_chunks_tmp")
+    conn.execute("drop table if exists docs.postgres_pages_tmp")
+    conn.execute("create table docs.postgres_pages_tmp (like docs.postgres_pages including all)")
+    conn.execute("create table docs.postgres_chunks_tmp (like docs.postgres_chunks including all)")
+    conn.execute("alter table docs.postgres_chunks_tmp add foreign key (page_id) references docs.postgres_pages_tmp(id)")
     conn.commit()
 
-    header_pattern = re.compile(
-        "^(#{1,3}) .+$"
-    )
+    header_pattern = re.compile('^(#{1,3}) .+$')
+    codeblock_pattern = re.compile('^```')
+
     section_prefix = r'^[A-Za-z0-9.]+\.\s*'
-    # TODO: trim Chapter ##. prefix from headers too, e.g.: Chapter 65. Database Physical Storage
-    codeblock_pattern = re.compile("^```")
+    chapter_prefix = r'^Chapter\s+[0-9]+\.\s*'
+
     for md in MD_DIR.glob("*.md"):
         print(f"chunking {md}...")
         with md.open() as f:
@@ -369,7 +372,9 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
                 header_hases = match.group(1)
                 depth = len(header_hases)
                 header_path = header_path[: (depth - 1)]
-                header = re.sub(section_prefix, '', line.lstrip("#").strip()).strip()
+                header = line.lstrip("#").strip()
+                header = re.sub(section_prefix, '', header).strip()
+                header = re.sub(chapter_prefix, '', header).strip()
                 header_path.append(header)
                 if chunk is not None:
                     process_chunk(conn, page, chunk)
@@ -383,19 +388,26 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
             update_page_stats(conn, page)
             conn.commit()
 
+    with conn.cursor() as cur:
+        cur.execute("drop table docs.postgres_chunks")
+        cur.execute("drop table docs.postgres_pages")
+        cur.execute("rename table docs.postgres_chunks_tmp to docs.postgres_chunks")
+        cur.execute("rename table docs.postgres_pages_tmp to docs.postgres_pages")
+    conn.commit()
+
 
 if __name__ == "__main__":
     update_repo()
     postgres_versions = [
         (17, "REL_17_6"),
-        # (16, "REL_16_9"),
-        # (15, "REL_15_13"),
-        # (14, "REL_14_18")
+        (16, "REL_16_9"),
+        (15, "REL_15_13"),
+        (14, "REL_14_18")
     ]
     db_uri = f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}"
     with psycopg.connect(db_uri) as conn:
         for version, tag in postgres_versions:
             print(f"Building Postgres {version} documentation...")
-            # build_html(version, tag)
-            # build_markdown()
+            build_html(version, tag)
+            build_markdown()
             chunk_files(conn, version)

From aa55e663af04ddabb1c374f3830a33497f357cb5 Mon Sep 17 00:00:00 2001
From: Matthew Peveler <mpeveler@tigerdata.com>
Date: Tue, 16 Sep 2025 18:46:19 -0600
Subject: [PATCH 4/5] wip

Signed-off-by: Matthew Peveler <mpeveler@tigerdata.com>
---
 ingest/postgres_docs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py
index f336e79..4a2fbf3 100644
--- a/ingest/postgres_docs.py
+++ b/ingest/postgres_docs.py
@@ -391,9 +391,9 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
     with conn.cursor() as cur:
         cur.execute("drop table docs.postgres_chunks")
         cur.execute("drop table docs.postgres_pages")
-        cur.execute("rename table docs.postgres_chunks_tmp to docs.postgres_chunks")
-        cur.execute("rename table docs.postgres_pages_tmp to docs.postgres_pages")
-    conn.commit()
+        cur.execute("alter table docs.postgres_chunks_tmp rename to postgres_chunks")
+        cur.execute("alter table docs.postgres_pages_tmp rename to postgres_pages")
+        conn.commit()  # NOTE(review): this swap runs on every chunk_files() call (once per version) and the tmp tables start empty each time, so earlier versions' rows look like they get dropped - confirm
 
 
 if __name__ == "__main__":

From c4027951d557eb8e78fe9ba419c8ed81309cde50 Mon Sep 17 00:00:00 2001
From: Matthew Peveler <mpeveler@tigerdata.com>
Date: Wed, 17 Sep 2025 13:51:04 -0600
Subject: [PATCH 5/5] calculate page stats from db summation

Signed-off-by: Matthew Peveler <mpeveler@tigerdata.com>
---
 ingest/postgres_docs.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py
index 4a2fbf3..a37f16e 100644
--- a/ingest/postgres_docs.py
+++ b/ingest/postgres_docs.py
@@ -175,8 +175,6 @@ class Page:
     url: str
     domain: str
     filename: str
-    content_length: int = 0
-    chunks_count: int = 0
 
 
 @dataclass
@@ -214,14 +212,21 @@ def update_page_stats(
     conn: psycopg.Connection,
     page: Page,
 ) -> None:
-    conn.execute(
-        "update docs.postgres_pages_tmp set content_length = %s, chunks_count = %s where id = %s",
-        [
-            page.content_length,
-            page.chunks_count,
-            page.id,
-        ],
-    )
+    conn.execute("""
+        update docs.postgres_pages_tmp p
+        set
+            content_length = coalesce(chunks_stats.total_length, 0),
+            chunks_count = coalesce(chunks_stats.chunks_count, 0)
+        from (
+            select
+                page_id,
+                sum(char_length(content)) as total_length,
+                count(*) as chunks_count
+            from docs.postgres_chunks_tmp
+            group by page_id
+        ) as chunks_stats
+        where p.id = chunks_stats.page_id and p.id = %s
+    """, [page.id])  # one placeholder, one parameter (psycopg rejects a count mismatch); limits the update to this page
 
 
 def insert_chunk(
@@ -297,8 +302,6 @@ def split_chunk(chunk: Chunk) -> list[Chunk]:
 
 
 def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None:
-    page.content_length += len(chunk.content)
-    page.chunks_count += 1
 
     if chunk.content == "":  # discard empty chunks
         return