@@ -64,6 +64,7 @@ class Tokens:
64
64
65
65
66
66
def get_domain (url : str ) -> str :
67
+ """Extracts the domain from a URL."""
67
68
if "http" not in url :
68
69
# If `get_domain` is called on a domain, add a scheme so that the
69
70
# original domain is returned instead of the empty string.
@@ -72,12 +73,14 @@ def get_domain(url: str) -> str:
72
73
73
74
74
75
def multiple_replace(text: str, replacements: dict[str, str]) -> str:
    """Apply several literal substring replacements in a single regex pass.

    Args:
        text: The input string.
        replacements: Mapping of literal substrings to their replacements.

    Returns:
        ``text`` with every occurrence of each key replaced by its value.
    """
    # Guard: an empty mapping would compile to the pattern "()", which
    # matches the empty string at every position and raises KeyError("")
    # inside the substitution callback.
    if not replacements:
        return text
    regex = re.compile("(%s)" % "|".join(map(re.escape, replacements.keys())))
    return regex.sub(lambda mo: replacements[mo.group(1)], text)
77
79
78
80
79
81
@functools .lru_cache (maxsize = 1024 )
80
82
def mark_lines (text : str ) -> str :
83
+ """Adds line numbers (ex: 'L0:') to the beginning of each line in a string."""
81
84
# Split the string by newline characters
82
85
lines = text .split ("\n " )
83
86
@@ -88,16 +91,19 @@ def mark_lines(text: str) -> str:
88
91
89
92
@functools.cache
def _tiktoken_vocabulary_lengths(enc_name: str) -> list[int]:
    """Return the decoded character length of every token in a TikToken vocabulary.

    Cached with ``functools.cache`` so the full vocabulary scan runs at most
    once per encoding name.
    """
    encoding = tiktoken.get_encoding(enc_name)
    lengths: list[int] = []
    for token_id in range(encoding.n_vocab):
        lengths.append(len(encoding.decode([token_id])))
    return lengths
93
97
94
98
95
99
def warmup_caches(enc_names: list[str]) -> None:
    """Pre-populate the token-length cache for the given TikToken encodings.

    Args:
        enc_names: Names of TikToken encodings to warm up.
    """
    # A plain loop is clearer than the original `for _ in map(...): pass`
    # idiom: the cached helper is called purely for its side effect.
    for enc_name in enc_names:
        _tiktoken_vocabulary_lengths(enc_name)
98
103
99
104
100
105
def _replace_special_chars (text : str ) -> str :
106
+ """Replaces specific special characters with visually similar alternatives."""
101
107
replacements = {
102
108
"【" : "〖" ,
103
109
"】" : "〗" ,
@@ -110,16 +116,19 @@ def _replace_special_chars(text: str) -> str:
110
116
111
117
112
118
def merge_whitespace(text: str) -> str:
    """Collapse every run of whitespace (including newlines) into one space.

    Args:
        text: The input string.

    Returns:
        ``text`` with each maximal run of whitespace replaced by a single
        space. Leading/trailing whitespace becomes one space, not stripped.
    """
    # ``\s`` already matches "\n", so the original separate
    # text.replace("\n", " ") pass was redundant; one substitution suffices.
    return re.sub(r"\s+", " ", text)
116
123
117
124
118
125
def arxiv_to_ar5iv(url: str) -> str:
    """Convert an arxiv.org URL to its ar5iv.org equivalent.

    Args:
        url: A URL, typically pointing at arxiv.org.

    Returns:
        The URL with "arxiv.org" rewritten to "ar5iv.org".
    """
    # Escape the dot: the original pattern r"arxiv.org" treated "." as a
    # regex wildcard and would also rewrite strings like "arxivXorg".
    return re.sub(r"arxiv\.org", "ar5iv.org", url)
120
128
121
129
122
130
def _clean_links (root : lxml .html .HtmlElement , cur_url : str ) -> dict [str , str ]:
131
+ """Processes all anchor tags in the HTML, replaces them with a custom format and returns an ID-to-URL mapping."""
123
132
cur_domain = get_domain (cur_url )
124
133
urls : dict [str , str ] = {}
125
134
urls_rev : dict [str , str ] = {}
@@ -156,10 +165,12 @@ def _clean_links(root: lxml.html.HtmlElement, cur_url: str) -> dict[str, str]:
156
165
157
166
158
167
def _get_text(node: lxml.html.HtmlElement) -> str:
    """Return all text under *node* as a single whitespace-normalized string."""
    fragments = node.itertext()
    joined = " ".join(fragments)
    return merge_whitespace(joined)
160
170
161
171
162
172
def _remove_node(node: lxml.html.HtmlElement) -> None:
    """Detach *node* from its parent in the lxml tree."""
    parent = node.getparent()
    parent.remove(node)
164
175
165
176
@@ -172,6 +183,7 @@ def _escape_md_section(text: str, snob: bool = False) -> str:
172
183
173
184
174
185
def html_to_text (html : str ) -> str :
186
+ """Converts an HTML string to clean plaintext."""
175
187
html = re .sub (HTML_SUP_RE , r"^{\2}" , html )
176
188
html = re .sub (HTML_SUB_RE , r"_{\2}" , html )
177
189
# add spaces between tags such as table cells
@@ -195,6 +207,7 @@ def html_to_text(html: str) -> str:
195
207
196
208
197
209
def _remove_math(root: lxml.html.HtmlElement) -> None:
    """Strip every <math> element from the tree rooted at *root*."""
    math_nodes = root.findall(".//math")
    for math_node in math_nodes:
        _remove_node(math_node)
200
213
@@ -209,6 +222,7 @@ def remove_unicode_smp(text: str) -> str:
209
222
210
223
211
224
def replace_node_with_text (node : lxml .html .HtmlElement , text : str ) -> None :
225
+ """Replaces an lxml node with a text string while preserving surrounding text."""
212
226
previous = node .getprevious ()
213
227
parent = node .getparent ()
214
228
tail = node .tail or ""
@@ -224,6 +238,7 @@ def replace_images(
224
238
base_url : str ,
225
239
session : aiohttp .ClientSession | None ,
226
240
) -> None :
241
+ """Finds all image tags and replaces them with numbered placeholders (includes alt/title if available)."""
227
242
cnt = 0
228
243
for img_tag in root .findall (".//img" ):
229
244
image_name = img_tag .get ("alt" , img_tag .get ("title" ))
0 commit comments