1
+ from bs4 import BeautifulSoup as bs
2
+ import bs4
3
+ import mistune
4
+ from timeout_decorator import TimeoutError , timeout
5
+ from functools import partial
6
+ from typing import List , Union , Collection , Optional , Any , Callable , Iterable
7
+ from urllib3 .util import parse_url
8
+ import regex
9
+ from textacy .preprocess import preprocess_text , normalize_whitespace
10
+ import re
11
+
12
# initialize markdown parser (module-level so it is built once and reused)
markdown = mistune.Markdown()

# shamlessly stolen from fastai
# Type aliases: either a single scalar/str or a collection of anything,
# optionally None.  Used by `listify` / `compose` below.
ListOrItem = Union[Collection[Any], int, float, str]
OptListOrItem = Optional[ListOrItem]
18
+
19
+
20
def compose(funcs: List[Callable]) -> Callable:
    """
    shamlessly stolen from fastai.core.compose
    Compose `funcs`: returns a callable that threads its first argument
    through each function in `funcs`, left to right.
    """
    def _chained(fs, value, *args, **kwargs):
        # Feed the running result into each function in turn.
        for fn in listify(fs):
            value = fn(value, *args, **kwargs)
        return value

    return partial(_chained, funcs)
29
+
30
+
31
def listify(p: 'OptListOrItem' = None, q: 'OptListOrItem' = None):
    """
    shamlessly stolen from fastai.core.listify
    Make `p` listy and the same length as `q`.

    `q` may be an int (the desired length) or a collection (whose length
    is used).  A length-1 `p` is repeated to match; otherwise lengths
    must agree.  Raises AssertionError on a length mismatch.
    """
    if p is None:
        p = []
    elif isinstance(p, str):
        p = [p]
    elif not isinstance(p, Iterable):
        p = [p]
    # Rank 0 tensors in PyTorch are Iterable but don't have a length.
    else:
        try:
            len(p)
        except TypeError:  # was a bare `except:`; only a missing __len__ should wrap
            p = [p]
    n = q if type(q) == int else len(p) if q is None else len(q)
    if len(p) == 1:
        p = p * n
    assert len(p) == n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)
48
+
49
+
50
class md:
    "class that organizes functions that can cleanup a namespace"

    @staticmethod
    def parse(x: str) -> bs4.BeautifulSoup:
        """Render markdown string `x` to HTML and return the parsed soup.

        Pre-cleans raw HTML fragments and BigQuery linebreak markers
        (`xxxlnbrk`) before handing the text to mistune/BeautifulSoup.
        Parsing is capped at 1 second to guard against pathological input.
        """
        # find & replace html, which can break things (non-greedy)
        # BUGFIX: re.DOTALL was passed as the 4th positional argument of
        # re.sub, which is `count` (=16), not `flags` -- pass it by keyword.
        x = re.sub(r'<.+?>.+?</.+?>|<[a-zA-Z]{1,}.*?>', 'xxxhtml', x, flags=re.DOTALL)

        # because former html replacement was non-greedy dedupe html marker
        x = re.sub(r'(xxxhtml(xxxlnbrk)?(\s)?)+', ' xxxhtml ', x)

        # fix the linebreak issue from BigQuery
        x = re.sub(r'xxxlnbrk( +)?', '\n', x)

        @timeout(1)
        def timed_parse(x):
            try:
                return bs(markdown(x), features="html5lib")
            except TimeoutError:
                # fall back to a sentinel document rather than crashing
                return bs(markdown('xxxunabletoparse'), features="html5lib")

        return timed_parse(x)

    @staticmethod
    def prepend(fldname: str, tag: Union[List[str], str], soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """Insert marker `fldname` at the start of every matching `tag` element."""
        # loop variable renamed: the original shadowed the `tag` argument
        for el in soup.find_all(listify(tag)):
            # <hr> carries no text but still deserves its marker
            if el.text.strip() or el.name == 'hr':
                el.insert(0, fldname + ' ')
        return soup

    @staticmethod
    def enclose(bfldname: str, efldname: str, tag: Union[List[str], str], nlines: int, soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """Helper function for when you want to add a beginning and ending marker to text."""
        # loop variable renamed: the original shadowed the `tag` argument
        for el in soup.find_all(listify(tag)):

            # preview the text inside an enclosure: show nlines of the
            # beginning and nlines of the end.
            text_lines = el.text.split('\n')
            if len(text_lines) <= nlines * 2:
                newstr = el.text
            else:
                newstr = '\n'.join(text_lines[:nlines] + text_lines[-nlines:])

            el.string = newstr

            # add the values of the class attributes, if exist
            el.insert(0, bfldname + ' ' + (' '.join(el['class']) if 'class' in el.attrs else '') + ' ')

            # insert ending tag with/without space depending if last char is \n
            if el.text[-1] == '\n':
                el.append(efldname)
            else:
                el.append(' ' + efldname)
        return soup

    @staticmethod
    def lst(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate list elements <ul> and <ol>"
        for tag in soup.find_all(['ul', 'ol']):
            # clear all the artifacts that are in lists and replace with text.
            text = 'xxxlistB ' + tag.getText() + 'xxxlistE'
            tag.string = text.strip()
        return soup

    @staticmethod
    def tbl(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate table elements <table> only keeping information from header rows"
        for tag in soup.find_all('table'):
            # empty string if there are no table headers.
            text = ''
            if tag.thead:
                text = 'xxtbl ' + '|'.join([x.getText() for x in tag.thead.find_all('th')])
            tag.string = text
        return soup

    @staticmethod
    def img(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate <img> elements with alt text and the file extension of src"
        for tag in soup.find_all('img'):
            tag.insert(0, 'xxximg ')
            if 'alt' in tag.attrs:
                tag.insert(1, tag['alt'])
            if 'src' in tag.attrs:
                # keep only the extension of the image path
                tag.append(' xxximgf ' + tag['src'].split('.')[-1])
        return soup

    @staticmethod
    def lnk(soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate <a> elements with the link host and title, when present"
        for tag in soup.find_all('a'):
            if 'href' in tag.attrs:
                try:
                    tag.append(' xxxlnkhb ' + parse_url(tag['href']).host + ' xxxlnkhe')
                except Exception:
                    # best-effort: unparseable hrefs (or a None host) are
                    # skipped; was a bare `except:` which also ate
                    # KeyboardInterrupt/SystemExit
                    pass
            if 'title' in tag.attrs:
                tag.append(' xxxlnktb ' + tag['title'] + ' xxxlnkte ')
        return soup

    @staticmethod
    def get_text(soup: bs4.BeautifulSoup) -> str:
        "get the raw text"
        text = soup.getText()
        # translate newlines back from BigQuery
        text = re.sub(r'\n\n+', '\n', text)
        # translate double quotes back from BigQuery
        text = re.sub(r'xxxdblqte', '"', text)
        return normalize_whitespace(text)

    @staticmethod
    def sym(text: str) -> str:
        """generalize symbols such as urls, emails, phone numbers and filepaths to generic tokens."""
        text = preprocess_text(text,
                               no_emails=True,
                               no_phone_numbers=True,
                               no_accents=True)

        # generalize file paths
        file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}|[Cc]:\\\w+(\\[0-9a-zA-Z_\-]+)+'
        text = re.sub(file_path_regex, ' xxxfilepath ', text)

        # generalize @ mentions
        at_mention_regex = r'\W@\w+'
        text = re.sub(at_mention_regex, ' xxxatmention ', text)

        # get date/time
        text = re.sub(r'\d+[-/]\d+[-/]\d+(.{0,2})?(\d+:\d+:\d+)', ' xxxdatetm ', text)

        # strings that have >=4 dots w/o any whitespace in between
        text = re.sub(r'(\S+\.\S+){4,}', 'xxunk', text)

        # things that look like IP addresses
        # BUGFIX: the third separator dot was unescaped (matched any char)
        text = re.sub(r'\d+\.\d+\.\d+\.\d+', 'xxunk', text)

        # long strings or numbers
        text = re.sub(r'\S{30,}|\d{6,}', 'xxunk', text)

        # generalize json (recursive pattern needs the `regex` module)
        json_regex = r'\{(?:[^{}]|(?R))*\}'
        text = regex.sub(json_regex, ' xxxjson ', text)

        return text

    ### transformations that are the same from factory functions
    # large headers: h1
    hL = partial(prepend.__func__, 'xxxhl', 'h1')
    # medium headers: h2, h3
    hM = partial(prepend.__func__, 'xxxhm', ['h2', 'h3'])
    # small headers: h4, h5, h6
    hS = partial(prepend.__func__, 'xxxhs', ['h4', 'h5', 'h6'])
    # code blocks
    code = partial(enclose.__func__, ' xxxcdb ', ' xxxcde ', 'code', 2)
    # paragraph blocks (plain text)
    txt = partial(prepend.__func__, '', 'p')
    # block quotes
    bqt = partial(enclose.__func__, 'xxxqb', 'xxxqe', 'blockquote', 3)
    # strikethrough
    st = partial(enclose.__func__, 'xxxdelb', 'xxxdele', 'del', 1)
    # horizontal rule
    hr = partial(prepend.__func__, 'xxxhr', 'hr')
208
+
209
+
210
# Full markdown -> annotated-text pipeline, applied in order: `parse`
# first (str -> soup), then the soup-level annotators, and finally
# `get_text` (soup -> str) and `sym` (str -> str).
transform_pre_rules = [md.parse, md.hL, md.hM, md.hS, md.lst, md.bqt,
                       md.code, md.tbl, md.st, md.txt, md.lnk, md.img,
                       md.hr, md.get_text, md.sym]
0 commit comments