Skip to content

Commit 047319d

Browse files
adarsh-crafts and Danztee
authored and committed
docs: add docstrings to utility and helper functions (openai#97)
Co-authored-by: Adarsh N <[email protected]>
1 parent eb190d4 commit 047319d

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

gpt_oss/tools/simple_browser/page_contents.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class Tokens:
6464

6565

6666
def get_domain(url: str) -> str:
67+
"""Extracts the domain from a URL."""
6768
if "http" not in url:
6869
# If `get_domain` is called on a domain, add a scheme so that the
6970
# original domain is returned instead of the empty string.
@@ -72,12 +73,14 @@ def get_domain(url: str) -> str:
7273

7374

7475
def multiple_replace(text: str, replacements: dict[str, str]) -> str:
    """Apply every substitution in ``replacements`` to ``text`` in a single regex pass."""
    # Alternation of all (escaped) keys; the capture group tells us which key matched.
    alternation = "|".join(re.escape(key) for key in replacements)
    pattern = re.compile("(%s)" % alternation)
    return pattern.sub(lambda match: replacements[match.group(1)], text)
7779

7880

7981
@functools.lru_cache(maxsize=1024)
8082
def mark_lines(text: str) -> str:
83+
"""Adds line numbers (ex: 'L0:') to the beginning of each line in a string."""
8184
# Split the string by newline characters
8285
lines = text.split("\n")
8386

@@ -88,16 +91,19 @@ def mark_lines(text: str) -> str:
8891

8992
@functools.cache
def _tiktoken_vocabulary_lengths(enc_name: str) -> list[int]:
    """Return, for each token id in the named TikToken encoding, the length of its decoded string."""
    encoding = tiktoken.get_encoding(enc_name)
    # Hoist the bound method out of the comprehension for the n_vocab-sized loop.
    decode = encoding.decode
    return [len(decode([token_id])) for token_id in range(encoding.n_vocab)]
9397

9498

9599
def warmup_caches(enc_names: list[str]) -> None:
    """Pre-populate the token-length cache for each given TikToken encoding name."""
    for enc_name in enc_names:
        # Called purely for its caching side effect; the result is discarded.
        _tiktoken_vocabulary_lengths(enc_name)
98103

99104

100105
def _replace_special_chars(text: str) -> str:
106+
"""Replaces specific special characters with visually similar alternatives."""
101107
replacements = {
102108
"【": "〖",
103109
"】": "〗",
@@ -110,16 +116,19 @@ def _replace_special_chars(text: str) -> str:
110116

111117

112118
def merge_whitespace(text: str) -> str:
    """Collapse newlines and runs of consecutive whitespace in ``text`` into single spaces."""
    without_newlines = text.replace("\n", " ")
    return re.sub(r"\s+", " ", without_newlines)
116123

117124

118125
def arxiv_to_ar5iv(url: str) -> str:
    """Convert an arxiv.org URL into the equivalent ar5iv.org URL.

    URLs that do not contain "arxiv.org" are returned unchanged.
    """
    # The dot must be escaped: an unescaped `.` matches any character, so
    # e.g. "arxivXorg" would otherwise be rewritten as well.
    return re.sub(r"arxiv\.org", "ar5iv.org", url)
120128

121129

122130
def _clean_links(root: lxml.html.HtmlElement, cur_url: str) -> dict[str, str]:
131+
"""Processes all anchor tags in the HTML, replaces them with a custom format and returns an ID-to-URL mapping."""
123132
cur_domain = get_domain(cur_url)
124133
urls: dict[str, str] = {}
125134
urls_rev: dict[str, str] = {}
@@ -156,10 +165,12 @@ def _clean_links(root: lxml.html.HtmlElement, cur_url: str) -> dict[str, str]:
156165

157166

158167
def _get_text(node: lxml.html.HtmlElement) -> str:
    """Return all text contained in ``node``, joined and whitespace-normalized."""
    fragments = node.itertext()
    return merge_whitespace(" ".join(fragments))
160170

161171

162172
def _remove_node(node: lxml.html.HtmlElement) -> None:
    """Detach ``node`` from the lxml tree by removing it from its parent."""
    parent = node.getparent()
    parent.remove(node)
164175

165176

@@ -172,6 +183,7 @@ def _escape_md_section(text: str, snob: bool = False) -> str:
172183

173184

174185
def html_to_text(html: str) -> str:
186+
"""Converts an HTML string to clean plaintext."""
175187
html = re.sub(HTML_SUP_RE, r"^{\2}", html)
176188
html = re.sub(HTML_SUB_RE, r"_{\2}", html)
177189
# add spaces between tags such as table cells
@@ -195,6 +207,7 @@ def html_to_text(html: str) -> str:
195207

196208

197209
def _remove_math(root: lxml.html.HtmlElement) -> None:
    """Strip every <math> element found anywhere under ``root``."""
    math_nodes = root.findall(".//math")
    for math_node in math_nodes:
        _remove_node(math_node)
200213

@@ -209,6 +222,7 @@ def remove_unicode_smp(text: str) -> str:
209222

210223

211224
def replace_node_with_text(node: lxml.html.HtmlElement, text: str) -> None:
225+
"""Replaces an lxml node with a text string while preserving surrounding text."""
212226
previous = node.getprevious()
213227
parent = node.getparent()
214228
tail = node.tail or ""
@@ -224,6 +238,7 @@ def replace_images(
224238
base_url: str,
225239
session: aiohttp.ClientSession | None,
226240
) -> None:
241+
"""Finds all image tags and replaces them with numbered placeholders (includes alt/title if available)."""
227242
cnt = 0
228243
for img_tag in root.findall(".//img"):
229244
image_name = img_tag.get("alt", img_tag.get("title"))

0 commit comments

Comments (0)