Skip to content

Commit d94209c

Browse files
committed
feat(dedupe): add a deduplication script
Add a script capable of deduplicating assets used between multiple build targets. When attempting to play with Sphinx's asset paths I noticed there was a lot of logic around preserving relative links. It doesn't seem like they want to allow usage of an asset path outside of the build directory. The WebSupport class seemed to allow some of the flexibility we were looking for, but unfortunately it still assumes the conf.py is in the source directory and has issues working with other Sphinx extensions like ifconfig. Given that what we are doing seems to be a niche use case, we can address this with a little post-processing using lxml filters. Signed-off-by: Randolph Sapp <[email protected]>
1 parent ccba41b commit d94209c

File tree

1 file changed

+127
-0
lines changed

1 file changed

+127
-0
lines changed

bin/dedupe.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#!/usr/bin/env python3
2+
3+
"""Tool to deduplicate HTML assets for GitHub pages deployments
4+
5+
SPDX-License-Identifier: MIT
6+
Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com
7+
"""
8+
9+
import logging
10+
11+
from lxml import html
12+
from root_index import get_root_index, BUILD_PATH
13+
14+
# Names of the asset directories, relative to each document root, that rewrite_paths
# joins onto the root to build the list of directories whose files get relocated.
COMMON_PATHS = {"_images", "_downloads", "_static"}
15+
16+
17+
def _rewrite_wrapper(document, old_rel_path, new_rel_path, check_list):
18+
"""Wrapper to replace links using lxml rewrite_links. Defines a throwaway function to make
19+
things faster.
20+
21+
:param document: lxml html document to operate on
22+
:param old_rel_path: Pathlib path to the document root directory
23+
:param new_rel_path: Pathlib path to the new common directory
24+
:param check_list: Iterable of pathlib paths to check
25+
"""
26+
27+
def _update_link(link):
28+
"""Function to interact with lxml's rewrite_links
29+
30+
:param link: String link to rewrite
31+
"""
32+
clean_link = link.strip()
33+
if clean_link[:4] == "http":
34+
return link
35+
36+
link_path = old_rel_path.joinpath(clean_link).resolve()
37+
for check_path in check_list:
38+
if link_path.is_relative_to(check_path):
39+
logging.info("old link path: %s", link_path)
40+
new_path = new_rel_path.joinpath(
41+
link_path.relative_to(check_path.parent)
42+
)
43+
logging.info("new link path: %s", new_path)
44+
rel_path = new_path.relative_to(old_rel_path, walk_up=True)
45+
logging.info("new rel path: %s", rel_path)
46+
logging.info("---")
47+
return rel_path.as_posix()
48+
49+
return link
50+
51+
document.rewrite_links(_update_link, resolve_base_href=False)
52+
53+
54+
def _move_files(old_rel_path, new_rel_path, check_list):
55+
"""Move the files that match the check_list from the old_rel_path root into new_rel_path.
56+
57+
:param old_rel_path: Pathlib path to the document root directory
58+
:param new_rel_path: Pathlib path to the new common directory
59+
:param check_list: Iterable of pathlib paths to check
60+
"""
61+
for check_path in check_list:
62+
operating_dir = old_rel_path.joinpath(check_path)
63+
for path in operating_dir.glob("**/*"):
64+
if not path.is_file():
65+
continue
66+
rel = path.relative_to(old_rel_path)
67+
logging.info("moving file: %s", rel)
68+
new = new_rel_path.joinpath(rel)
69+
logging.info("destination: %s", new)
70+
path.replace(new)
71+
logging.info("---")
72+
73+
for empty_dir in sorted(operating_dir.glob("**/*"), reverse=True):
74+
empty_dir.rmdir()
75+
76+
77+
def rewrite_paths(root_dir, common_dir):
    """Rewrite asset links in every HTML file under root_dir, then move the assets.

    All links are rewritten first and the files are moved afterwards. This assumes:

    1. Paths are already relative to the given root_dir
    2. The root_dir resides under the common_dir

    :param root_dir: Pathlib path to document root directory
    :param common_dir: Pathlib path to new common_dir directory
    """
    check_list = {root_dir.joinpath(name).resolve() for name in COMMON_PATHS}

    logging.info("rewriting paths")
    for html_path in root_dir.glob("**/*.html"):
        document = html.fromstring(html_path.read_text(encoding="utf-8"))

        # Links are interpreted relative to the directory that holds this HTML file
        document_dir = html_path.parent.resolve()
        shared_dir = common_dir.resolve()
        _rewrite_wrapper(document, document_dir, shared_dir, check_list)

        # Written back as UTF-8 bytes with a content-type meta tag and an explicit doctype
        with html_path.open("wb") as sink:
            sink.write(
                html.tostring(
                    document,
                    encoding="utf-8",
                    include_meta_content_type=True,
                    doctype="<!DOCTYPE html>",
                )
            )

    logging.info("moving the files")
    _move_files(root_dir.resolve(), common_dir.resolve(), check_list)
108+
109+
110+
def main():
    """Find every document root under BUILD_PATH and deduplicate its assets.

    All roots are collected before any of them is processed, so the discovery pass
    finishes before rewrite_paths starts moving files.
    """
    logging.basicConfig(level=logging.INFO)

    root_list = []
    for candidate in BUILD_PATH.glob("*/"):
        index_path = get_root_index(candidate)
        # A falsy result is skipped; presumably it means no index was found (confirm in
        # root_index)
        if not index_path:
            continue
        root_list.append(index_path.parent)
        logging.info("found the following index: %s", index_path)

    for document_dir in root_list:
        logging.info("working on the following document dir: %s", document_dir)
        rewrite_paths(document_dir, BUILD_PATH)
124+
125+
126+
# Run the deduplication only when executed as a script, not when imported
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)