From d399fb1d415db0f3960c7a37a2f8c8b60e36be08 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 16 Sep 2025 15:39:19 -0600 Subject: [PATCH 1/5] feat: add postgres ingest Signed-off-by: Matthew Peveler --- .gitignore | 4 +- ingest/postgres_docs.py | 289 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 ingest/postgres_docs.py diff --git a/.gitignore b/.gitignore index d14eab8..bc6d753 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ dist node_modules -.env \ No newline at end of file +.env +ingest/build +ingest/postgres diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py new file mode 100644 index 0000000..8a57603 --- /dev/null +++ b/ingest/postgres_docs.py @@ -0,0 +1,289 @@ +from dataclasses import dataclass +from dotenv import load_dotenv +from bs4 import BeautifulSoup, element as BeautifulSoupElement +from markdownify import markdownify +import openai +import os +from pathlib import Path +import psycopg +import re +import shutil +import subprocess +import tiktoken + + +THIS_DIR = Path(__file__).parent.resolve() + +load_dotenv(dotenv_path=os.path.join(THIS_DIR, "..", ".env")) + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + +POSTGRES_DIR = THIS_DIR / "postgres" +SMGL_DIR = POSTGRES_DIR / "doc" / "src" / "sgml" +HTML_DIR = SMGL_DIR / "html" +BUILD_DIR = THIS_DIR / "build" +BUILD_DIR.mkdir(exist_ok=True) +MD_DIR = BUILD_DIR / "md" + +POSTGRES_BASE_URL = "https://www.postgresql.org/docs" + +ENC = tiktoken.get_encoding("o200k_base") + +def update_repo(): + if not POSTGRES_DIR.exists(): + subprocess.run( + "git clone https://github.com/postgres/postgres.git postgres", + shell=True, + check=True, + env=os.environ, + text=True, + ) + else: + subprocess.run( + "git fetch", + shell=True, + check=True, + env=os.environ, + text=True, + cwd=POSTGRES_DIR, + ) + + +def build_html(version: int, tag: str) -> None: + html_stamp = SMGL_DIR / "html-stamp" + + # make uses the presence of 
html-stamp to determine if it needs to + # rebuild the html docs. + if html_stamp.exists(): + html_stamp.unlink() + + if HTML_DIR.exists(): + shutil.rmtree(HTML_DIR) + + print(f"checking out version {version} at {tag}...") + subprocess.run( + f"git checkout {tag}", + shell=True, + check=True, + env=os.environ, + text=True, + cwd=POSTGRES_DIR, + ) + + print("configuring postgres build...") + environ = os.environ.copy() + # Shim for macOS and icu4c installed via homebrew, where it's not linked into + # /usr/local by default. + if Path("/opt/homebrew/opt/icu4c/lib/pkgconfig").exists(): + environ["PKG_CONFIG_PATH"] = "/opt/homebrew/opt/icu4c/lib/pkgconfig" + subprocess.run( + "./configure --without-readline --without-zlib", + shell=True, + check=True, + env=environ, + text=True, + cwd=POSTGRES_DIR, + ) + + print("building postgres docs...") + subprocess.run( + "make html", + shell=True, + check=True, + env=os.environ, + text=True, + cwd=SMGL_DIR, + ) + + +def build_markdown() -> None: + print("converting to markdown...") + if MD_DIR.exists(): + shutil.rmtree(MD_DIR) + MD_DIR.mkdir() + + for html_file in HTML_DIR.glob("*.html"): + if html_file.name in [ + "legalnotice.html", + "appendix-obsolete.html", + "appendixes.html", + "biblio.html", + "bookindex.html", + "bug-reporting.html", + "source-format.html", + "error-message-reporting.html", + "error-style-guide.html", + "source-conventions.html", + "sourcerepo.html", + ] or html_file.name.startswith("docguide"): + continue + md_file = MD_DIR / (html_file.stem + ".md") + + html_content = html_file.read_text(encoding="utf-8") + html_content = html_content.replace( + '', "" + ) + + soup = BeautifulSoup(html_content, "html.parser") + + is_refentry = bool(soup.find("div", class_="refentry")) + + elem = soup.find("div", attrs={"id": True}) + if elem and isinstance(elem, BeautifulSoupElement.Tag): + slug = str(elem["id"]).lower() + ".html" + else: + raise SystemError(f"No div with id found in {html_file}") + + title =
soup.find("title") + title_text = ( + str(title.string).strip() + if title and isinstance(title, BeautifulSoupElement.Tag) + else "PostgreSQL Documentation" + ) + if title: + title.decompose() + for class_name in ["navheader", "navfooter"]: + for div in soup.find_all("div", class_=class_name): + div.decompose() + + # Convert first h3 to h4 in notice/warning/tip divs + if not is_refentry: + for class_name in ["caution", "important", "notice", "warning", "tip", "note"]: + for div in soup.find_all("div", class_=class_name): + if div is None or not isinstance(div, BeautifulSoupElement.Tag): + continue + h3 = div.find("h3") + if h3 and isinstance(h3, BeautifulSoupElement.Tag): + h3.name = "h4" + + md_content = markdownify(str(soup), heading_style="ATX") + md_content = f"""--- +title: {title_text} +slug: {slug} +refentry: {is_refentry} +--- +{md_content}""" + md_file.write_text(md_content, encoding="utf-8") + + +@dataclass +class Chunk: + header: str + header_path: list[str] + content: str + slug: str + version: int + token_count: int = 0 + +def insert_chunk( + conn: psycopg.Connection, + chunk: Chunk, +) -> None: + client = openai.OpenAI(api_key=OPENAI_API_KEY) + content = '' + for i in range(len(chunk.header_path)): + content += ''.join(['#' for _ in range(i + 1)]) + ' ' + chunk.header_path[i] + '\n\n' + content += chunk.content + embedding = client.embeddings.create( + model="text-embedding-3-small", + input=chunk.content, + ).data[0].embedding + content = chunk.content + # token_count, embedding = embed(header_path, content) + print(f"header: {chunk.header}") + conn.execute( + "insert into docs.postgres_2 (version, header, header_path, source_url, content, token_count, embedding) values (%s,%s,%s,%s,%s,%s,%s)", + [ + chunk.version, + chunk.header, + chunk.header_path, + f"{POSTGRES_BASE_URL}/{chunk.version}/{chunk.slug}", + content, + 0, + embedding, + ], + ) + conn.commit() + + +def process_chunk(conn: psycopg.Connection, chunk: Chunk) -> None: + if chunk.content == 
"": # discard empty chunks + return + + chunk.token_count = len(ENC.encode(chunk.content)) + if chunk.token_count < 10: # discard chunks that are too tiny to be useful + return + + chunks = [chunk] + + if chunk.token_count > 7000: + print(f"chunk {chunk.header} too large ({chunk.token_count} tokens), skipping...") + return + # chunks = chunk_by_term(chunk) + + for chunk in chunks: + insert_chunk(conn, chunk) + + +def chunk_files(conn: psycopg.Connection, version: int) -> None: + conn.execute("delete from docs.postgres_2 where version = %s", [version]) + + header_pattern = re.compile( + "^(#{1,3}) .+$" + ) # find lines that are markdown headers with 1-3 # + codeblock_pattern = re.compile("^```") + for md in MD_DIR.glob("*.md"): + print(f"chunking {md}...") + with md.open() as f: + # process the frontmatter + f.readline() + title_line = f.readline() + slug = f.readline().split(":", 1)[1].strip() + refentry = f.readline().split(":", 1)[1].strip().lower() == "true" + f.readline() + header_path = [] + chunk: Chunk | None = None + in_codeblock = False + while True: + line = f.readline() + if line == "": + if chunk is not None: + process_chunk(conn, chunk) + break + match = header_pattern.match(line) + if match is None or in_codeblock or (refentry and chunk is not None): + assert chunk is not None + if codeblock_pattern.match(line): + in_codeblock = not in_codeblock + chunk.content += line + continue + header = match.group(1) + depth = len(header) + header_path = header_path[: (depth - 1)] + header_path.append(line.lstrip("#").strip()) + if chunk is not None: + process_chunk(conn, chunk) + chunk = Chunk( + header=line.lstrip("#").strip(), + header_path=header_path.copy(), + content="", + slug=slug, + version=version, + ) + + +if __name__ == "__main__": + update_repo() + postgres_versions = [ + (17, "REL_17_6"), + # (16, "REL_16_9"), + # (15, "REL_15_13"), + # (14, "REL_14_18") + ] + db_uri = 
f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}" + with psycopg.connect(db_uri) as conn: + for version, tag in postgres_versions: + print(f"Building Postgres {version} documentation...") + # build_html(version, tag) + build_markdown() + chunk_files(conn, version) From 50ee21e4a97d3386d3a44066a8a1fb3b1943e5a2 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 16 Sep 2025 16:56:02 -0600 Subject: [PATCH 2/5] wip Signed-off-by: Matthew Peveler --- ingest/postgres_docs.py | 170 +++++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 29 deletions(-) diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py index 8a57603..c5b9031 100644 --- a/ingest/postgres_docs.py +++ b/ingest/postgres_docs.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from dotenv import load_dotenv from bs4 import BeautifulSoup, element as BeautifulSoupElement +import json from markdownify import markdownify import openai import os @@ -27,7 +28,8 @@ POSTGRES_BASE_URL = "https://www.postgresql.org/docs" -ENC = tiktoken.get_encoding("o200k_base") +ENC = tiktoken.get_encoding("cl100k_base") +MAX_CHUNK_TOKENS = 7000 def update_repo(): if not POSTGRES_DIR.exists(): @@ -166,17 +168,65 @@ def build_markdown() -> None: md_file.write_text(md_content, encoding="utf-8") +@dataclass +class Page: + id: int + version: int + url: str + domain: str + filename: str + content_length: int = 0 + chunks_count: int = 0 + + @dataclass class Chunk: + idx: int header: str header_path: list[str] content: str - slug: str - version: int token_count: int = 0 + subindex: int = 0 + + +def insert_page( + conn: psycopg.Connection, + page: Page, +) -> None: + print('inserting page', page.filename, page.url) + result = conn.execute( + "insert into docs.postgres_pages (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id", + [ + page.version, + page.url, + 
page.domain, + page.filename, + 0, + 0, + ], + ) + row = result.fetchone() + assert row is not None + page.id = row[0] + + +def update_page_stats( + conn: psycopg.Connection, + page: Page, +) -> None: + conn.execute( + "update docs.postgres_pages set content_length = %s, chunks_count = %s where id = %s", + [ + page.content_length, + page.chunks_count, + page.id, + ], + ) + def insert_chunk( conn: psycopg.Connection, + page: Page, chunk: Chunk, ) -> None: client = openai.OpenAI(api_key=OPENAI_API_KEY) @@ -191,22 +241,65 @@ def insert_chunk( content = chunk.content # token_count, embedding = embed(header_path, content) print(f"header: {chunk.header}") + url = page.url + if len(chunk.header_path) > 1: + pattern = r'\((#\S+)\)' + match = re.search(pattern, chunk.header_path[-1]) + if match: + url += match.group(1).lower() conn.execute( - "insert into docs.postgres_2 (version, header, header_path, source_url, content, token_count, embedding) values (%s,%s,%s,%s,%s,%s,%s)", + "insert into docs.postgres_chunks (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)", [ - chunk.version, - chunk.header, - chunk.header_path, - f"{POSTGRES_BASE_URL}/{chunk.version}/{chunk.slug}", - content, - 0, + page.id, + chunk.idx, + chunk.subindex, + chunk.content, + json.dumps({ + "header": chunk.header, + "header_path": chunk.header_path, + "source_url": url, + "token_count": chunk.token_count, + }), embedding, ], ) - conn.commit() -def process_chunk(conn: psycopg.Connection, chunk: Chunk) -> None: +def split_chunk(chunk: Chunk) -> list[Chunk]: + num_subchunks = (chunk.token_count // MAX_CHUNK_TOKENS) + 1 + input_ids = ENC.encode(chunk.content) + + tokens_per_chunk = len(input_ids) // num_subchunks + + subchunks = [] + subindex = 0 + idx = 0 + while idx < len(input_ids): + cur_idx = min(idx + tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[idx:cur_idx] + if not chunk_ids: + break + decoded = ENC.decode(chunk_ids) + if decoded: + 
subchunks.append(Chunk( + idx=chunk.idx, + header=chunk.header, + header_path=chunk.header_path, + content=decoded, + token_count=len(chunk_ids), + subindex=subindex, + )) + subindex += 1 + if cur_idx == len(input_ids): + break + idx += tokens_per_chunk + return subchunks + + +def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None: + page.content_length += len(chunk.content) + page.chunks_count += 1 + if chunk.content == "": # discard empty chunks return @@ -216,39 +309,55 @@ def process_chunk(conn: psycopg.Connection, chunk: Chunk) -> None: chunks = [chunk] - if chunk.token_count > 7000: - print(f"chunk {chunk.header} too large ({chunk.token_count} tokens), skipping...") - return - # chunks = chunk_by_term(chunk) + if chunk.token_count > MAX_CHUNK_TOKENS: + print(f"Chunk {chunk.header} too large ({chunk.token_count} tokens), splitting...") + chunks = split_chunk(chunk) for chunk in chunks: - insert_chunk(conn, chunk) + insert_chunk(conn, page, chunk) + conn.commit() def chunk_files(conn: psycopg.Connection, version: int) -> None: - conn.execute("delete from docs.postgres_2 where version = %s", [version]) + conn.execute("delete from docs.postgres_chunks where page_id IN (select id from docs.postgres_pages where version = %s)", [version]) + conn.execute("delete from docs.postgres_pages where version = %s", [version]) + conn.commit() header_pattern = re.compile( "^(#{1,3}) .+$" - ) # find lines that are markdown headers with 1-3 # + ) + section_prefix = r'^[A-Za-z0-9.]+\.\s*' + # TODO: trim Chapter ##. prefix from headers too, e.g.: Chapter 65. 
Database Physical Storage codeblock_pattern = re.compile("^```") for md in MD_DIR.glob("*.md"): print(f"chunking {md}...") with md.open() as f: # process the frontmatter f.readline() - title_line = f.readline() + f.readline() # title line slug = f.readline().split(":", 1)[1].strip() refentry = f.readline().split(":", 1)[1].strip().lower() == "true" f.readline() + + page = Page( + id=0, + version=version, + url=f"{POSTGRES_BASE_URL}/{version}/{slug}", + domain="postgresql.org", + filename=md.name, + ) + + insert_page(conn, page) + header_path = [] + idx = 0 chunk: Chunk | None = None in_codeblock = False while True: line = f.readline() if line == "": if chunk is not None: - process_chunk(conn, chunk) + process_chunk(conn, page, chunk) break match = header_pattern.match(line) if match is None or in_codeblock or (refentry and chunk is not None): @@ -257,19 +366,22 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None: in_codeblock = not in_codeblock chunk.content += line continue - header = match.group(1) - depth = len(header) + header_hases = match.group(1) + depth = len(header_hases) header_path = header_path[: (depth - 1)] - header_path.append(line.lstrip("#").strip()) + header = re.sub(section_prefix, '', line.lstrip("#").strip()).strip() + header_path.append(header) if chunk is not None: - process_chunk(conn, chunk) + process_chunk(conn, page, chunk) chunk = Chunk( - header=line.lstrip("#").strip(), + idx=idx, + header=header, header_path=header_path.copy(), content="", - slug=slug, - version=version, ) + idx += 1 + update_page_stats(conn, page) + conn.commit() if __name__ == "__main__": @@ -285,5 +397,5 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None: for version, tag in postgres_versions: print(f"Building Postgres {version} documentation...") # build_html(version, tag) - build_markdown() + # build_markdown() chunk_files(conn, version) From c423ef542af9a2704b6467b98785b9005db1edc7 Mon Sep 17 00:00:00 2001 From: Matthew Peveler 
Date: Tue, 16 Sep 2025 17:46:13 -0600 Subject: [PATCH 3/5] wip Signed-off-by: Matthew Peveler --- ingest/postgres_docs.py | 44 ++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py index c5b9031..f336e79 100644 --- a/ingest/postgres_docs.py +++ b/ingest/postgres_docs.py @@ -195,7 +195,7 @@ def insert_page( ) -> None: print('inserting page', page.filename, page.url) result = conn.execute( - "insert into docs.postgres_pages (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id", + "insert into docs.postgres_pages_tmp (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id", [ page.version, page.url, @@ -215,7 +215,7 @@ def update_page_stats( page: Page, ) -> None: conn.execute( - "update docs.postgres_pages set content_length = %s, chunks_count = %s where id = %s", + "update docs.postgres_pages_tmp set content_length = %s, chunks_count = %s where id = %s", [ page.content_length, page.chunks_count, @@ -248,7 +248,7 @@ def insert_chunk( if match: url += match.group(1).lower() conn.execute( - "insert into docs.postgres_chunks (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)", + "insert into docs.postgres_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)", [ page.id, chunk.idx, @@ -319,16 +319,19 @@ def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None: def chunk_files(conn: psycopg.Connection, version: int) -> None: - conn.execute("delete from docs.postgres_chunks where page_id IN (select id from docs.postgres_pages where version = %s)", [version]) - conn.execute("delete from docs.postgres_pages where version = %s", [version]) + conn.execute("drop table if exists docs.postgres_chunks_tmp") + conn.execute("drop table if exists 
docs.postgres_pages_tmp") + conn.execute("create table docs.postgres_pages_tmp (like docs.postgres_pages including all)") + conn.execute("create table docs.postgres_chunks_tmp (like docs.postgres_chunks including all)") + conn.execute("alter table docs.postgres_chunks_tmp add foreign key (page_id) references docs.postgres_pages_tmp(id)") conn.commit() - header_pattern = re.compile( - "^(#{1,3}) .+$" - ) + header_pattern = re.compile('^(#{1,3}) .+$') + codeblock_pattern = re.compile('^```') + section_prefix = r'^[A-Za-z0-9.]+\.\s*' - # TODO: trim Chapter ##. prefix from headers too, e.g.: Chapter 65. Database Physical Storage - codeblock_pattern = re.compile("^```") + chapter_prefix = r'^Chapter\s+[0-9]+\.\s*' + for md in MD_DIR.glob("*.md"): print(f"chunking {md}...") with md.open() as f: @@ -369,7 +372,9 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None: header_hases = match.group(1) depth = len(header_hases) header_path = header_path[: (depth - 1)] - header = re.sub(section_prefix, '', line.lstrip("#").strip()).strip() + header = line.lstrip("#").strip() + header = re.sub(section_prefix, '', header).strip() + header = re.sub(chapter_prefix, '', header).strip() header_path.append(header) if chunk is not None: process_chunk(conn, page, chunk) @@ -383,19 +388,26 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None: update_page_stats(conn, page) conn.commit() + with conn.cursor() as cur: + cur.execute("drop table docs.postgres_chunks") + cur.execute("drop table docs.postgres_pages") + cur.execute("rename table docs.postgres_chunks_tmp to docs.postgres_chunks") + cur.execute("rename table docs.postgres_pages_tmp to docs.postgres_pages") + conn.commit() + if __name__ == "__main__": update_repo() postgres_versions = [ (17, "REL_17_6"), - # (16, "REL_16_9"), - # (15, "REL_15_13"), - # (14, "REL_14_18") + (16, "REL_16_9"), + (15, "REL_15_13"), + (14, "REL_14_18") ] db_uri = 
f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}" with psycopg.connect(db_uri) as conn: for version, tag in postgres_versions: print(f"Building Postgres {version} documentation...") - # build_html(version, tag) - # build_markdown() + build_html(version, tag) + build_markdown() chunk_files(conn, version) From aa55e663af04ddabb1c374f3830a33497f357cb5 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 16 Sep 2025 18:46:19 -0600 Subject: [PATCH 4/5] wip Signed-off-by: Matthew Peveler --- ingest/postgres_docs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py index f336e79..4a2fbf3 100644 --- a/ingest/postgres_docs.py +++ b/ingest/postgres_docs.py @@ -391,9 +391,9 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None: with conn.cursor() as cur: cur.execute("drop table docs.postgres_chunks") cur.execute("drop table docs.postgres_pages") - cur.execute("rename table docs.postgres_chunks_tmp to docs.postgres_chunks") - cur.execute("rename table docs.postgres_pages_tmp to docs.postgres_pages") - conn.commit() + cur.execute("alter table docs.postgres_chunks_tmp rename to postgres_chunks") + cur.execute("alter table docs.postgres_pages_tmp rename to postgres_pages") + conn.commit() if __name__ == "__main__": From c4027951d557eb8e78fe9ba419c8ed81309cde50 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Wed, 17 Sep 2025 13:51:04 -0600 Subject: [PATCH 5/5] calculate page stats from db summation Signed-off-by: Matthew Peveler --- ingest/postgres_docs.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/ingest/postgres_docs.py b/ingest/postgres_docs.py index 4a2fbf3..a37f16e 100644 --- a/ingest/postgres_docs.py +++ b/ingest/postgres_docs.py @@ -175,8 +175,6 @@ class Page: url: str domain: str filename: str - content_length: int = 0 - chunks_count: int = 
0 @dataclass @@ -214,14 +212,21 @@ def update_page_stats( conn: psycopg.Connection, page: Page, ) -> None: - conn.execute( - "update docs.postgres_pages_tmp set content_length = %s, chunks_count = %s where id = %s", - [ - page.content_length, - page.chunks_count, - page.id, - ], - ) + conn.execute(""" + update docs.postgres_pages_tmp p + set + content_length = coalesce(chunks_stats.total_length, 0), + chunks_count = coalesce(chunks_stats.chunks_count, 0) + from ( + select + page_id, + sum(char_length(content)) as total_length, + count(*) as chunks_count + from docs.postgres_chunks_tmp where page_id = %s + group by page_id + ) as chunks_stats + where p.id = chunks_stats.page_id and p.id = %s + """, [page.id, page.id]) def insert_chunk( @@ -297,8 +302,6 @@ def split_chunk(chunk: Chunk) -> list[Chunk]: def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None: - page.content_length += len(chunk.content) - page.chunks_count += 1 if chunk.content == "": # discard empty chunks return