Skip to content
This repository was archived by the owner on Dec 9, 2022. It is now read-only.

Commit f89f8d4

Browse files
committed
add files
0 parents  commit f89f8d4

File tree

8 files changed

+1356
-0
lines changed

8 files changed

+1356
-0
lines changed

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2019 Hamel Husain
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Pipfile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[[source]]
2+
name = "pypi"
3+
url = "https://pypi.org/simple"
4+
verify_ssl = true
5+
6+
[dev-packages]
7+
8+
[packages]
9+
bs4 = "*"
10+
mistune = "*"
11+
timeout-decorator = "*"
12+
urllib3 = "*"
13+
textacy = "*"
14+
ipykernel = "*"
15+
regex = "*"
16+
html5lib = "*"
17+
18+
[requires]
19+
python_version = "3.7"

Pipfile.lock

Lines changed: 631 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

example.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# This is a test markdown that has different types of formatting
2+
3+
## Header 2
4+
5+
Hello **world** this is going to be a long sentence that. also has a newline:
6+
lala lala
7+
8+
continuing the sentence here.
9+
more text.
10+
11+
- bullet 1
12+
- bullet 2
13+
14+
"quoted text"
15+
16+
#### a small header
17+
18+
```python
19+
def something(x):
20+
\"""docstring\"""
21+
#some comment
22+
somevar = 123
23+
anothervar = somevar * 3
24+
for i in range(33):
25+
anothervar *= i
26+
27+
if i % 2 == 0:
28+
return True
29+
else:
30+
return False
31+
```
32+
33+
@mention somebody
34+
35+
> blockquote text is here!
36+
37+
[hobbit-hole][1]
38+
39+
1. first
40+
41+
2. second
42+
43+
44+
> what this?
45+
46+
![Tux, the Linux mascot](https://d33wubrfki0l68.cloudfront.net/e7ed9fe4bafe46e275c807d63591f85f9ab246ba/e2d28/assets/images/tux.png)
47+
48+
49+
[1]: <https://en.wikipedia.org/wiki/Hobbit#Lifestyle> "Hobbit lifestyles"
50+
51+
| First Header | Second Header |
52+
| ------------- | ------------- |
53+
| Content Cell | Content Cell |
54+
| Content Cell | Content Cell |
55+
56+
----
57+
58+
Hello `something` here and __stuff__ is _there_. [google](www.google.com/search). random text
59+
60+
~~The world is flat.~~ We now know that the world is round.
61+
62+
- [x] Write the press release
63+
- [ ] Update the website
64+
- [ ] Contact the media
65+
66+
67+
`http://www.example.com`

mdparse/__init__.py

Whitespace-only changes.

mdparse/parser.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
from bs4 import BeautifulSoup as bs
2+
import bs4
3+
import mistune
4+
from timeout_decorator import TimeoutError, timeout
5+
from functools import partial
6+
from typing import List, Union, Collection, Optional, Any, Callable, Iterable
7+
from urllib3.util import parse_url
8+
import regex
9+
from textacy.preprocess import preprocess_text, normalize_whitespace
10+
import re
11+
12+
# Module-level markdown renderer; `md.parse` calls it to turn markdown into HTML.
markdown = mistune.Markdown()

# Type aliases borrowed from fastai:
#   ListOrItem    - either a collection of anything or a single int/float/str
#   OptListOrItem - the same, but `None` is also accepted
ListOrItem = Union[Collection[Any], int, float, str]
OptListOrItem = Optional[ListOrItem]
18+
19+
20+
def compose(funcs:List[Callable])->Callable:
    """
    Chain `funcs` into one callable (borrowed from fastai.core.compose).

    The returned callable passes its first argument through every function in
    `funcs`, in order, feeding each result to the next function. Any extra
    positional or keyword arguments are forwarded unchanged to every function.
    """
    def compose_(funcs, x, *args, **kwargs):
        # `listify` normalizes `funcs`, so a single callable is accepted as well as a list.
        for func in listify(funcs):
            x = func(x, *args, **kwargs)
        return x

    # Bind `funcs` now; the caller supplies `x` (and any extras) later.
    return partial(compose_, funcs)
29+
30+
31+
def listify(p:'OptListOrItem'=None, q:'OptListOrItem'=None):
    """
    Make `p` listy and the same length as `q` (borrowed from fastai.core.listify).

    Args:
        p: `None` (becomes an empty list), a string or non-iterable (wrapped in
            a one-element list), or an iterable. An iterable without a length
            is treated as a single item and wrapped as well.
        q: target length. An `int` is used directly, `None` means "keep the
            length of `p`", anything else contributes `len(q)`.

    Returns:
        A new list. A one-element `p` is repeated to reach the target length.

    Raises:
        AssertionError: if the final length of `p` differs from the target.
    """
    if p is None:
        p = []
    elif isinstance(p, str) or not isinstance(p, Iterable):
        p = [p]
    else:
        # Rank 0 tensors in PyTorch are Iterable but don't have a length;
        # `len` raises TypeError for those, so treat them as a single item.
        try:
            len(p)
        except TypeError:
            p = [p]

    if isinstance(q, int):
        n = q
    elif q is None:
        n = len(p)
    else:
        n = len(q)

    # Convert before repeating so that array-likes are repeated, not multiplied
    # element-wise, and so that one-element sets/dicts do not raise.
    if len(p) == 1:
        p = list(p) * n
    assert len(p) == n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)
48+
49+
50+
class md:
    """
    Namespace of static helpers that turn markdown into annotated plain text.

    `parse` converts a markdown string into a BeautifulSoup tree, the
    soup-to-soup helpers insert `xxx...` marker tokens around structural
    elements (headers, code, lists, tables, links, images, ...), `get_text`
    flattens the tree back to a string and `sym` replaces symbol-like
    substrings (file paths, @mentions, timestamps, JSON, ...) with generic tokens.
    """

    @staticmethod
    def parse(x:str) -> bs4.BeautifulSoup:
        """
        Render markdown `x` with mistune and parse the HTML with html5lib.

        Before rendering, HTML-looking spans are replaced by the token
        `xxxhtml` and `xxxlnbrk` markers are turned into newlines. The render
        is wrapped in `timeout(1)`; if `TimeoutError` is raised inside it, a
        soup built from the placeholder text `xxxunabletoparse` is returned.
        """
        # find & replace html, which can break things (non-greedy).
        # `flags` must be passed by keyword: the fourth positional parameter of
        # re.sub is `count`, so a positional re.DOTALL (== 16) would cap the
        # number of replacements at 16 and never enable DOTALL.
        x = re.sub(r'<.+?>.+?</.+?>|<[a-zA-Z]{1,}.*?>', 'xxxhtml', x, flags=re.DOTALL)

        # because former html replacement was non-greedy dedupe html marker
        x = re.sub(r'(xxxhtml(xxxlnbrk)?(\s)?)+', ' xxxhtml ', x)

        # fix the linebreak issue from BigQuery
        x = re.sub(r'xxxlnbrk( +)?', '\n', x)

        @timeout(1)
        def timed_parse(x):
            try:
                return bs(markdown(x), features="html5lib")
            except TimeoutError:
                return bs(markdown('xxxunabletoparse'), features="html5lib")

        return timed_parse(x)

    @staticmethod
    def prepend(fldname:str, tag:Union[List[str], str], soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """
        Insert `fldname` followed by a space at the start of each `tag` element.

        `tag` may be one tag name or a list of names. Elements whose text is
        empty or whitespace-only are skipped, except `<hr>`, which is always
        annotated. The soup is modified in place and returned.
        """
        for node in soup.find_all(listify(tag)):
            if node.text.strip() or node.name == 'hr':
                node.insert(0, fldname+' ')
        return soup

    @staticmethod
    def enclose(bfldname:str, efldname:str, tag:Union[List[str], str], nlines:int, soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """
        Helper function for when you want to add a beginning and ending marker to text.

        For each `tag` element the content is replaced by its plain text,
        truncated to the first and last `nlines` lines when it has more than
        `2 * nlines` lines. `bfldname` (followed by the element's class
        values, if any) is inserted in front and `efldname` is appended.
        The soup is modified in place and returned.
        """
        for node in soup.find_all(listify(tag)):

            # preview the text inside an enclosure show nlines of beginning and nlines of the end.
            text_lines = node.text.split('\n')
            if len(text_lines) <= nlines * 2:
                newstr = node.text
            else:
                newstr = '\n'.join(text_lines[:nlines] + text_lines[-nlines:])

            # assigning .string drops any child elements and leaves only the text
            node.string = newstr

            # add the values of the class attributes, if exist
            class_values = ' '.join(node['class']) if 'class' in node.attrs else ''
            node.insert(0, bfldname + ' ' + class_values + ' ')

            # insert ending tag with/without space depending if last char is \n
            if node.text.endswith('\n'):
                node.append(efldname)
            else:
                node.append(' ' + efldname)
        return soup

    @staticmethod
    def lst(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate list elements <ul> and <ol>"
        for tag in soup.find_all(['ul', 'ol']):
            # clear all the artifacts that are in lists and replace with text.
            text = 'xxxlistB ' + tag.getText() + 'xxxlistE'
            tag.string = text.strip()
        return soup

    @staticmethod
    def tbl(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate table elements <table> only keeping information from header rows"
        for tag in soup.find_all('table'):
            # empty string if there are no table headers.
            text = ''
            if tag.thead:
                text = 'xxtbl ' + '|'.join([x.getText() for x in tag.thead.find_all('th')])
            tag.string = text
        return soup

    @staticmethod
    def img(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """
        Annotate <img> elements.

        Inserts `xxximg ` followed by the `alt` text (when present) and appends
        ` xxximgf ` plus whatever follows the last '.' in `src` (when present).
        """
        for tag in soup.find_all('img'):
            tag.insert(0, 'xxximg ')
            if 'alt' in tag.attrs:
                tag.insert(1, tag['alt'])
            if 'src' in tag.attrs:
                tag.append(' xxximgf ' + tag['src'].split('.')[-1])
        return soup

    @staticmethod
    def lnk(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """
        Annotate <a> elements with the host of `href` and with `title`.

        The host annotation is best effort: links whose `href` cannot be
        parsed, or that have no host part, are left without it.
        """
        for tag in soup.find_all('a'):
            if 'href' in tag.attrs:
                try:
                    host = parse_url(tag['href']).host
                except Exception:
                    # unparsable URL: skip the host annotation rather than fail
                    host = None
                if host is not None:
                    tag.append(' xxxlnkhb ' + host + ' xxxlnkhe')
            if 'title' in tag.attrs:
                tag.append(' xxxlnktb ' + tag['title'] + ' xxxlnkte ')
        return soup

    @staticmethod
    def get_text(soup:bs4.BeautifulSoup) -> str:
        "get the raw text"
        text = soup.getText()
        # translate newlines back from BigQuery
        text = re.sub(r'\n\n+', '\n', text)
        # translate double quotes back from BigQuery
        text = re.sub(r'xxxdblqte', ' " ', text)
        return normalize_whitespace(text)

    @staticmethod
    def sym(text:str) -> str:
        """
        Generalize symbol-like substrings to generic tokens.

        Emails, phone numbers and accents are handled by textacy's
        `preprocess_text`; file paths, @mentions, date/times, dotted strings,
        IP-like strings, very long strings/numbers and JSON-like brace groups
        are replaced by the regexes below, in that order.
        """
        text = preprocess_text(text,
                               no_emails=True,
                               no_phone_numbers=True,
                               no_accents=True)

        # generalize file paths
        file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}|[Cc]:\\\w+(\\[0-9a-zA-Z_\-]+)+'
        text = re.sub(file_path_regex, ' xxxfilepath ', text)

        # generalize @ mentions
        at_mention_regex = r'\W@\w+'
        text = re.sub(at_mention_regex, ' xxxatmention ', text)

        # get date/time
        text = re.sub(r'\d+[-/]\d+[-/]\d+(.{0,2})?(\d+:\d+:\d+)', ' xxxdatetm ', text)

        # strings that have >=4 dots w/o any whitespace in between
        text = re.sub(r'(\S+\.\S+){4,}', 'xxunk', text)

        # things that look like IP addresses (every separator must be a literal dot)
        text = re.sub(r'\d+\.\d+\.\d+\.\d+', 'xxunk', text)

        # long strings or numbers
        text = re.sub(r'\S{30,}|\d{6,}', 'xxunk', text)

        # generalize json; (?R) is recursion, supported by `regex` but not by `re`
        json_regex = r'\{(?:[^{}]|(?R))*\}'
        text = regex.sub(json_regex, ' xxxjson ', text)

        return text

    ### transformations that are the same from factory functions
    # `__func__` unwraps the staticmethod objects so they can be used inside the class body.
    # large headers: h1
    hL = partial(prepend.__func__, 'xxxhl', 'h1')
    # medium headers: h2, h3
    hM = partial(prepend.__func__, 'xxxhm', ['h2', 'h3'])
    # small headers: h4, h5, h6
    hS = partial(prepend.__func__, 'xxxhs', ['h4', 'h5', 'h6'])
    # code blocks
    code = partial(enclose.__func__, ' xxxcdb ', ' xxxcde ', 'code', 2)
    # paragraph blocks (plain text)
    txt = partial(prepend.__func__, '', 'p')
    # block quotes
    bqt = partial(enclose.__func__, 'xxxqb', 'xxxqe', 'blockquote', 3)
    # strikethrough
    st = partial(enclose.__func__, 'xxxdelb', 'xxxdele', 'del', 1)
    # horizontal rule
    hr = partial(prepend.__func__, 'xxxhr', 'hr')
208+
209+
210+
# Ordered transformation steps: `md.parse` turns the raw string into a soup,
# the steps in between annotate that soup, `md.get_text` flattens it back to
# a string and `md.sym` post-processes the string.
# NOTE(review): presumably meant to be passed to `compose` - confirm with callers.
transform_pre_rules = [
    md.parse,
    md.hL,
    md.hM,
    md.hS,
    md.lst,
    md.bqt,
    md.code,
    md.tbl,
    md.st,
    md.txt,
    md.lnk,
    md.img,
    md.hr,
    md.get_text,
    md.sym,
]

0 commit comments

Comments
 (0)