<!DOCTYPE html>
<html class="writer-html5" lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>scrapemed._morehtml — scrapemed 1.0.8 documentation</title>
  <!-- Stylesheets: Pygments token colours for the highlighted source, then the theme. -->
  <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
  <!--[if lt IE 9]>
    <script src="../../_static/js/html5shiv.min.js"></script>
  <![endif]-->

  <!--
    These scripts are intentionally loaded synchronously and in this order.
    The inline script at the end of <body> calls jQuery(...) and references
    SphinxRtdTheme while the page is being parsed, so adding defer/async here
    without also changing that inline script would leave those names undefined.
  -->
  <script src="../../_static/jquery.js?v=5d32c60e"></script>
  <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
  <script src="../../_static/documentation_options.js?v=aec50437"></script>
  <script src="../../_static/doctools.js?v=888ff710"></script>
  <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
  <script src="../../_static/js/theme.js"></script>
  <link rel="index" title="Index" href="../../genindex.html" />
  <link rel="search" title="Search" href="../../search.html" />
</head>
22+
<body class="wy-body-for-nav">
  <div class="wy-grid-for-nav">
    <!-- Sidebar: project home link, documentation search form, and the top-level contents list. -->
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search">
          <a href="../../index.html" class="icon icon-home">
            scrapemed
          </a>
          <!-- Search form: submits the query as "q" to the site's search page via GET. -->
          <div role="search">
            <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
              <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
              <input type="hidden" name="check_keywords" value="yes" />
              <input type="hidden" name="area" value="default" />
            </form>
          </div>
        </div>
        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
          <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
          <ul>
            <li class="toctree-l1"><a class="reference internal" href="../../modules.html">scrapemed</a></li>
          </ul>
        </div>
      </div>
    </nav>
50+
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
  <!-- Top bar: menu toggle icon and a link back to the documentation home page. -->
  <nav class="wy-nav-top" aria-label="Mobile navigation menu">
    <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
    <a href="../../index.html">scrapemed</a>
  </nav>

  <div class="wy-nav-content">
    <div class="rst-content">
      <!-- Breadcrumb trail: Home > Module code > scrapemed._morehtml (current page). -->
      <div role="navigation" aria-label="Page navigation">
        <ul class="wy-breadcrumbs">
          <li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
          <li class="breadcrumb-item active">scrapemed._morehtml</li>
          <li class="wy-breadcrumbs-aside">
          </li>
        </ul>
        <hr />
      </div>
<!--
  Main article: the highlighted source listing of scrapemed._morehtml.
  Everything between <pre> and </pre> is whitespace-significant: line breaks and
  leading spaces are rendered exactly as written, so do not re-indent or wrap
  the lines of the listing.
-->
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">

<h1>Source code for scrapemed._morehtml</h1><div class="highlight"><pre>
<span></span><span class="sd">"""</span>
<span class="sd">ScrapeMed's Custom Markup Language - MoreHTML (MHTML)</span>
<span class="sd">======================================================</span>

<span class="sd">Wrapper on basic functions for HTML manipulation.</span>

<span class="sd">**Added on top of core html functionality:**</span>
<span class="sd">Non-markup significant unescape function, custom MHTML tag encoding and removal.</span>
<span class="sd">"""</span>

<span class="kn">import</span> <span class="nn">re</span>
<span class="kn">import</span> <span class="nn">html</span>
84+
85+
<!-- Highlighted listing of unescape_except. The id is the fragment target for
     this function and the [docs] link points back to its API entry. This sits
     inside <pre>, so spacing is significant; the Python indentation shown here
     was reconstructed (4 spaces per level) and should be confirmed against the
     module source. --><div class="viewcode-block" id="unescape_except">
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.unescape_except">[docs]</a>
<span class="k">def</span> <span class="nf">unescape_except</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="w">    </span><span class="sd">"""</span>
<span class="sd">    Convert all named and numeric character references in the provided string to</span>
<span class="sd">    the corresponding Unicode characters, excluding any provided encodings to be</span>
<span class="sd">    ignored.</span>

<span class="sd">    :param str s: The input string containing character references.</span>
<span class="sd">    :param kwargs: Keyword arguments of the form key=encoding. These encodings</span>
<span class="sd">        will be ignored when unescaping.</span>
<span class="sd">        For keys with multiple encodings, use unique keynames.</span>
<span class="sd">        Encodings must be single code strings.</span>
<span class="sd">    :type kwargs: dict</span>
<span class="sd">    :return: A string with character references unescaped, except for the</span>
<span class="sd">        specified encodings to be ignored.</span>
<span class="sd">    :rtype: str</span>

<span class="sd">    This function uses the rules defined by the HTML 5 standard for both valid</span>
<span class="sd">    and invalid character references, and the list of HTML 5 named character</span>
<span class="sd">    references defined in html.entities.html5.</span>
<span class="sd">    """</span>

    <span class="c1"># no need to do anything if there are no html encodings</span>
    <span class="k">if</span> <span class="s2">"&amp;"</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">s</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">s</span>

    <span class="n">encoding_dict</span> <span class="o">=</span> <span class="p">{}</span>

    <span class="c1"># Translate keys to MHTML placeholder codes</span>
    <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">encoding</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
        <span class="n">placehold_str</span> <span class="o">=</span> <span class="n">generate_mhtml_tag</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
        <span class="n">encoding_dict</span><span class="p">[</span><span class="n">placehold_str</span><span class="p">]</span> <span class="o">=</span> <span class="n">encoding</span>

    <span class="c1"># Convert encodings to MHTML placeholder codes</span>
    <span class="k">for</span> <span class="n">placehold_str</span><span class="p">,</span> <span class="n">encoding</span> <span class="ow">in</span> <span class="n">encoding_dict</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
        <span class="n">code_to_save</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">escape</span><span class="p">(</span><span class="n">encoding</span><span class="p">))</span>
        <span class="n">s</span> <span class="o">=</span> <span class="n">code_to_save</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="n">placehold_str</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span>

    <span class="c1"># Unescape everything else</span>
    <span class="n">s</span> <span class="o">=</span> <span class="n">html</span><span class="o">.</span><span class="n">unescape</span><span class="p">(</span><span class="n">s</span><span class="p">)</span>

    <span class="c1"># Convert placeheld items back to their original html encodings</span>
    <span class="k">for</span> <span class="n">placehold_str</span><span class="p">,</span> <span class="n">encoding</span> <span class="ow">in</span> <span class="n">encoding_dict</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
        <span class="n">placehold_r</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">escape</span><span class="p">(</span><span class="n">placehold_str</span><span class="p">))</span>
        <span class="n">s</span> <span class="o">=</span> <span class="n">placehold_r</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="n">encoding</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span>

    <span class="k">return</span> <span class="n">s</span></div>
134+
135+
136+
<!-- Highlighted listing of generate_mhtml_tag. The id is the fragment target
     for this function and the [docs] link points back to its API entry. This
     sits inside <pre>, so spacing is significant; the Python indentation was
     reconstructed (4 spaces per level), confirm against the module source.
     --><div class="viewcode-block" id="generate_mhtml_tag">
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.generate_mhtml_tag">[docs]</a>
<span class="k">def</span> <span class="nf">generate_mhtml_tag</span><span class="p">(</span><span class="n">string</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w">    </span><span class="sd">"""</span>
<span class="sd">    Generates an MHTML tag from the provided string.</span>

<span class="sd">    :param str string: The text to be tagged in MHTML format.</span>
<span class="sd">    :return: An MHTML tag containing the input string, in format</span>
<span class="sd">        `f"[MHTML::{string}]"`</span>
<span class="sd">    :rtype: str</span>
<span class="sd">    """</span>
    <span class="k">return</span> <span class="sa">f</span><span class="s2">"[MHTML::</span><span class="si">{</span><span class="n">string</span><span class="si">}</span><span class="s2">]"</span></div>
149+
150+
151+
<!-- Highlighted listing of generate_typed_mhtml_tag. The id is the fragment
     target for this function and the [docs] link points back to its API entry.
     This sits inside <pre>, so spacing is significant; the Python indentation
     was reconstructed (4 spaces per level), confirm against the module source.
     --><div class="viewcode-block" id="generate_typed_mhtml_tag">
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.generate_typed_mhtml_tag">[docs]</a>
<span class="k">def</span> <span class="nf">generate_typed_mhtml_tag</span><span class="p">(</span><span class="n">tag_type</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">string</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w">    </span><span class="sd">"""</span>
<span class="sd">    Generates a typed MHTML tag from the provided string.</span>

<span class="sd">    :param str tag_type: The type of the MHTML tag.</span>
<span class="sd">    :param str string: The text to be tagged in MHTML format.</span>
<span class="sd">    :return: A typed MHTML tag containing the input string, in format</span>
<span class="sd">        `[MHTML::type::string]`.</span>
<span class="sd">    :rtype: str</span>
<span class="sd">    """</span>
    <span class="k">return</span> <span class="sa">f</span><span class="s2">"[MHTML::</span><span class="si">{</span><span class="n">tag_type</span><span class="si">}</span><span class="s2">::</span><span class="si">{</span><span class="n">string</span><span class="si">}</span><span class="s2">]"</span></div>
165+
166+
167+
<!-- Highlighted listing of remove_mhtml_tags. The id is the fragment target
     for this function and the [docs] link points back to its API entry. This
     sits inside <pre>, so spacing is significant; the Python indentation was
     reconstructed (4 spaces per level), confirm against the module source.
     --><div class="viewcode-block" id="remove_mhtml_tags">
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.remove_mhtml_tags">[docs]</a>
<span class="k">def</span> <span class="nf">remove_mhtml_tags</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w">    </span><span class="sd">"""</span>
<span class="sd">    Removes all MHTML tags and typed MHTML tags found in the provided text.</span>

<span class="sd">    :param str text: The text from which to remove MHTML tags.</span>
<span class="sd">    :return: The text with MHTML tags removed.</span>
<span class="sd">    :rtype: str</span>
<span class="sd">    """</span>
    <span class="c1"># match MHTML tags</span>
    <span class="c1"># group1 = tag type for typed MHTML tags</span>
    <span class="c1"># group2 = tag value for typed MHTML tags</span>
    <span class="c1"># group3 = tag for non-typed MHTML tags</span>
    <span class="n">mhtml_pattern</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">"\[MHTML::([^:\[\]]+)::([^:\[\]]+)\]"</span> <span class="sa">r</span><span class="s2">"|\[MHTML::([^:\[\]]+)\]"</span>
    <span class="n">mhtml_r</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">mhtml_pattern</span><span class="p">)</span>
    <span class="c1"># remove MHTML tags and return result</span>
    <span class="k">return</span> <span class="n">mhtml_r</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">""</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span></div>
186+
</pre></div><!-- end of source listing (pre, div.highlight) -->

</div><!-- end articleBody -->
</div><!-- end main document -->
<!-- Page footer: copyright notice and build/theme attribution. -->
<footer>

  <hr />

  <div role="contentinfo">
    <p>© Copyright 2023, Daniel Frees.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
  <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
  provided by <a href="https://readthedocs.org">Read the Docs</a>.


</footer>
</div><!-- end rst-content -->
</div><!-- end wy-nav-content -->
</section><!-- end wy-nav-content-wrap -->
</div><!-- end wy-grid-for-nav -->
<script>
  // Once the DOM is ready, call the theme's Navigation.enable with `true`.
  // Uses the jQuery and SphinxRtdTheme globals, which the scripts loaded in
  // <head> are expected to provide before this point is reached.
  jQuery(function () {
    SphinxRtdTheme.Navigation.enable(true);
  });
</script>

</body>
</html>
0 commit comments