1
+ from bs4 import BeautifulSoup as bs
2
+ import bs4
3
+ import mistune
4
+ from timeout_decorator import TimeoutError , timeout
5
+ from functools import partial
6
+ from typing import List , Union , Collection , Optional , Any , Callable , Iterable
7
+ from urllib3 .util import parse_url
8
+ import regex
9
+ from textacy .preprocess import preprocess_text , normalize_whitespace
10
+ import re
11
+
12
# initialize markdown parser (module-level so it is built once and reused)
markdown = mistune.Markdown()

# shamlessly stolen from fastai
# Type aliases: either a single scalar/str or a collection of anything,
# optionally None.  Used by `listify` / `compose` below.
ListOrItem = Union[Collection[Any], int, float, str]
OptListOrItem = Optional[ListOrItem]
18
+
19
+
20
def compose(funcs: List[Callable]) -> Callable:
    """
    shamlessly stolen from fastai.core.compose
    Compose `funcs`: returns a callable that threads its first argument
    through each function in `funcs`, left to right.
    """
    def _chained(fs, value, *args, **kwargs):
        # Feed the running result into each function in turn.
        for fn in listify(fs):
            value = fn(value, *args, **kwargs)
        return value

    return partial(_chained, funcs)
29
+
30
+
31
def listify(p: 'OptListOrItem' = None, q: 'OptListOrItem' = None):
    """
    shamlessly stolen from fastai.core.listify
    Make `p` listy and the same length as `q`.

    `q` may be an int (the desired length) or a collection (whose length
    is used).  A length-1 `p` is repeated to match; otherwise lengths
    must agree.  Raises AssertionError on a length mismatch.
    """
    if p is None:
        p = []
    elif isinstance(p, str):
        p = [p]
    elif not isinstance(p, Iterable):
        p = [p]
    # Rank 0 tensors in PyTorch are Iterable but don't have a length.
    else:
        try:
            len(p)
        except TypeError:  # was a bare `except:`; only a missing __len__ should wrap
            p = [p]
    n = q if type(q) == int else len(p) if q is None else len(q)
    if len(p) == 1:
        p = p * n
    assert len(p) == n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)
48
+
49
+
50
class md:
    "class that organizes functions that can cleanup a namespace"

    @staticmethod
    def parse(x: str) -> bs4.BeautifulSoup:
        """Render markdown string `x` to HTML and return the parsed soup.

        Pre-cleans raw HTML fragments and BigQuery linebreak markers
        (`xxxlnbrk`) before handing the text to mistune/BeautifulSoup.
        Parsing is capped at 1 second to guard against pathological input.
        """
        # find & replace html, which can break things (non-greedy)
        # BUGFIX: re.DOTALL was passed as the 4th positional argument of
        # re.sub, which is `count` (=16), not `flags` -- pass it by keyword.
        x = re.sub(r'<.+?>.+?</.+?>|<[a-zA-Z]{1,}.*?>', 'xxxhtml', x, flags=re.DOTALL)

        # because former html replacement was non-greedy dedupe html marker
        x = re.sub(r'(xxxhtml(xxxlnbrk)?(\s)?)+', ' xxxhtml ', x)

        # fix the linebreak issue from BigQuery
        x = re.sub(r'xxxlnbrk( +)?', '\n', x)

        @timeout(1)
        def timed_parse(x):
            try:
                return bs(markdown(x), features="html5lib")
            except TimeoutError:
                # fall back to a sentinel document rather than crashing
                return bs(markdown('xxxunabletoparse'), features="html5lib")

        return timed_parse(x)

    @staticmethod
    def prepend(fldname: str, tag: Union[List[str], str], soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """Insert marker `fldname` at the start of every matching `tag` element."""
        # loop variable renamed: the original shadowed the `tag` argument
        for el in soup.find_all(listify(tag)):
            # <hr> carries no text but still deserves its marker
            if el.text.strip() or el.name == 'hr':
                el.insert(0, fldname + ' ')
        return soup

    @staticmethod
    def enclose(bfldname: str, efldname: str, tag: Union[List[str], str], nlines: int, soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """Helper function for when you want to add a beginning and ending marker to text."""
        # loop variable renamed: the original shadowed the `tag` argument
        for el in soup.find_all(listify(tag)):

            # preview the text inside an enclosure: show nlines of the
            # beginning and nlines of the end.
            text_lines = el.text.split('\n')
            if len(text_lines) <= nlines * 2:
                newstr = el.text
            else:
                newstr = '\n'.join(text_lines[:nlines] + text_lines[-nlines:])

            el.string = newstr

            # add the values of the class attributes, if exist
            el.insert(0, bfldname + ' ' + (' '.join(el['class']) if 'class' in el.attrs else '') + ' ')

            # insert ending tag with/without space depending if last char is \n
            if el.text[-1] == '\n':
                el.append(efldname)
            else:
                el.append(' ' + efldname)
        return soup

    @staticmethod
    def lst(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate list elements <ul> and <ol>"
        for tag in soup.find_all(['ul', 'ol']):
            # clear all the artifacts that are in lists and replace with text.
            text = 'xxxlistB ' + tag.getText() + 'xxxlistE'
            tag.string = text.strip()
        return soup

    @staticmethod
    def tbl(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate table elements <table> only keeping information from header rows"
        for tag in soup.find_all('table'):
            # empty string if there are no table headers.
            text = ''
            if tag.thead:
                text = 'xxtbl ' + '|'.join([x.getText() for x in tag.thead.find_all('th')])
            tag.string = text
        return soup

    @staticmethod
    def img(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate <img> elements with alt text and the file extension of src"
        for tag in soup.find_all('img'):
            tag.insert(0, 'xxximg ')
            if 'alt' in tag.attrs:
                tag.insert(1, tag['alt'])
            if 'src' in tag.attrs:
                # keep only the extension of the image path
                tag.append(' xxximgf ' + tag['src'].split('.')[-1])
        return soup

    @staticmethod
    def lnk(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate <a> elements with the link host and title, when present"
        for tag in soup.find_all('a'):
            if 'href' in tag.attrs:
                try:
                    tag.append(' xxxlnkhb ' + parse_url(tag['href']).host + ' xxxlnkhe')
                except Exception:
                    # best-effort: unparseable hrefs (or a None host) are
                    # skipped; was a bare `except:` which also ate
                    # KeyboardInterrupt/SystemExit
                    pass
            if 'title' in tag.attrs:
                tag.append(' xxxlnktb ' + tag['title'] + ' xxxlnkte ')
        return soup

    @staticmethod
    def get_text(soup: bs4.BeautifulSoup) -> str:
        "get the raw text"
        text = soup.getText()
        # translate newlines back from BigQuery
        text = re.sub(r'\n\n+', '\n', text)
        # translate double quotes back from BigQuery
        text = re.sub(r'xxxdblqte', '"', text)
        return normalize_whitespace(text)

    @staticmethod
    def sym(text: str) -> str:
        """generalize symbols such as urls, emails, phone numbers and filepaths to generic tokens."""
        text = preprocess_text(text,
                               no_emails=True,
                               no_phone_numbers=True,
                               no_accents=True)

        # generalize file paths
        file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}|[Cc]:\\\w+(\\[0-9a-zA-Z_\-]+)+'
        text = re.sub(file_path_regex, ' xxxfilepath ', text)

        # generalize @ mentions
        at_mention_regex = r'\W@\w+'
        text = re.sub(at_mention_regex, ' xxxatmention ', text)

        # get date/time
        text = re.sub(r'\d+[-/]\d+[-/]\d+(.{0,2})?(\d+:\d+:\d+)', ' xxxdatetm ', text)

        # strings that have >=4 dots w/o any whitespace in between
        text = re.sub(r'(\S+\.\S+){4,}', 'xxunk', text)

        # things that look like IP addresses
        # BUGFIX: the third separator dot was unescaped (matched any char)
        text = re.sub(r'\d+\.\d+\.\d+\.\d+', 'xxunk', text)

        # long strings or numbers
        text = re.sub(r'\S{30,}|\d{6,}', 'xxunk', text)

        # generalize json (recursive pattern needs the `regex` module)
        json_regex = r'\{(?:[^{}]|(?R))*\}'
        text = regex.sub(json_regex, ' xxxjson ', text)

        return text

    ### transformations that are the same from factory functions
    # large headers: h1
    hL = partial(prepend.__func__, 'xxxhl', 'h1')
    # medium headers: h2, h3
    hM = partial(prepend.__func__, 'xxxhm', ['h2', 'h3'])
    # small headers: h4, h5, h6
    hS = partial(prepend.__func__, 'xxxhs', ['h4', 'h5', 'h6'])
    # code blocks
    code = partial(enclose.__func__, ' xxxcdb ', ' xxxcde ', 'code', 2)
    # paragraph blocks (plain text)
    txt = partial(prepend.__func__, '', 'p')
    # block quotes
    bqt = partial(enclose.__func__, 'xxxqb', 'xxxqe', 'blockquote', 3)
    # strikethrough
    st = partial(enclose.__func__, 'xxxdelb', 'xxxdele', 'del', 1)
    # horizontal rule
    hr = partial(prepend.__func__, 'xxxhr', 'hr')
208
+
209
+
210
# Full markdown -> annotated-text pipeline, applied in order: `parse`
# first (str -> soup), then the soup-level annotators, and finally
# `get_text` (soup -> str) and `sym` (str -> str).
transform_pre_rules = [md.parse, md.hL, md.hM, md.hS, md.lst, md.bqt,
                       md.code, md.tbl, md.st, md.txt, md.lnk, md.img,
                       md.hr, md.get_text, md.sym]
0 commit comments