diff --git a/README.md b/README.md
index 198a278..5ef03bc 100644
--- a/README.md
+++ b/README.md
@@ -7,13 +7,18 @@ A Python parser for scientific PDF based on [GROBID](https://github.com/kermitt2
 Use `pip` to install from this Github repository
 
 ```bash
-pip install git+https://github.com/titipata/scipdf_parser
+pip install git+https://github.com/skuam/scipdf_parser
 ```
 
 **Note**
 * We also need an `en_core_web_sm` model for spacy, where you can run `python -m spacy download en_core_web_sm` to download it
 * You can change GROBID version in `serve_grobid.sh` to test the parser on a new GROBID version
+
+```bash
+python -m spacy download en_core_web_sm
+```
 
 ## Usage
 
 Run the GROBID using the given bash script before parsing PDF
@@ -26,39 +31,76 @@ This script will download GROBID and run the service at default port 8070 (see m
 
 To parse a PDF provided in `example_data` folder or direct URL, use the following function:
 
 ```python
-import scipdf
-article_dict = scipdf.parse_pdf_to_dict('example_data/futoma2017improved.pdf') # return dictionary
-
-# option to parse directly from URL to PDF, if as_list is set to True, output 'text' of parsed section will be in a list of paragraphs instead
-article_dict = scipdf.parse_pdf_to_dict('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf', as_list=False)
+import json
+from scipdf.parse_pdf import SciPDFParser
+from scipdf.models import Article
+
+parser = SciPDFParser()
+article: Article = parser.parse_pdf('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf')
+
+print(json.dumps(article.dict(), indent=4))
 
 # output example
->> {
-    'title': 'Proceedings of Machine Learning for Healthcare',
-    'abstract': '...',
-    'sections': [
-        {'heading': '...', 'text': '...'},
-        {'heading': '...', 'text': '...'},
-        ...
+{
+    "title": "A new method for measuring daytime sleepiness: the Epworth sleepiness scale.",
+    "authors": "Murray Johns",
+    "pub_date": "1991",
+    "abstract": "Text of abstract",
+    "sections": [
+        {
+            "heading": "Introduction",
+            "text": "Text of introduction",
+            "n_publication_ref": 1,
+            "n_figure_ref": 1
+        }
+    ],
+    "references": [
+        {
+            "title": "The Epworth Sleepiness Scale in Clinical Practice",
+            "journal": "Sleep Breath",
+            "year": "2017",
+            "authors": "Chervin RD, et al."
+        },
+        {
+            "title": "A new method for measuring daytime sleepiness: the Epworth sleepiness scale.",
+            "journal": "Sleep",
+            "year": "1991",
+            "authors": "Johns MW"
+        }
     ],
-    'references': [
-        {'title': '...', 'year': '...', 'journal': '...', 'author': '...'},
-        ...
+    "figures": [
+        {
+            "figure_label": "Figure 1",
+            "figure_type": "table",
+            "figure_id": "fig1",
+            "figure_caption": "Caption of figure 1",
+            "figure_data": "Data of figure 1"
+        }
     ],
-    'figures': [
-        {'figure_label': '...', 'figure_type': '...', 'figure_id': '...', 'figure_caption': '...', 'figure_data': '...'},
-        ...
+    "formulas": [
+        {
+            "formula_id": "f1",
+            "formula_text": "a^2 + b^2 = c^2",
+            "formula_coordinates": [
+                1,
+                2,
+                3,
+                4
+            ]
+        }
     ],
-    'doi': '...'
+    "doi": "10.1111/j.1365-2869.1991.tb00031.x"
 }
-
-xml = scipdf.parse_pdf('example_data/futoma2017improved.pdf', soup=True) # option to parse full XML from GROBID
 ```
+
+**Warning**: Parsing of figures is not yet supported in the pydantic models, so you need to parse them manually.
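+
+The returned `Article` also carries computed text statistics (readability scores, part-of-speech counts, and journal/reference features) on `article.text_stats`. A minimal sketch of reading them, assuming the statistics were computed without errors for your PDF:
+
+```python
+stats = article.text_stats  # TextStatistic: readability, text_stats, journal_features
+if stats is not None:
+    print(stats.readability.flesch_reading_ease)  # Flesch reading ease of the full text
+    print(stats.text_stats.n_word)                # total word count
+    print(stats.journal_features.n_reference)     # number of parsed references
+```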
+ To parse figures from PDF using [pdffigures2](https://github.com/allenai/pdffigures2), you can run ```python -scipdf.parse_figures('example_data', output_folder='figures') # folder should contain only PDF files +from scipdf.parse_pdf import SciPDFParser +parser = SciPDFParser() +parser.parse_figures('example_data', output_folder='figures') # folder should contain only PDF files ``` You can see example output figures in `figures` folder. diff --git a/example.py b/example.py new file mode 100644 index 0000000..85febb5 --- /dev/null +++ b/example.py @@ -0,0 +1,9 @@ +import json +from scipdf.parse_pdf import SciPDFParser +from scipdf.models import Article + +if __name__ == '__main__': + parser = SciPDFParser() + article: Article = parser.parse_pdf('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf') + + print(json.dumps(article.dict(), indent=4)) diff --git a/requirements.txt b/requirements.txt index f90aadb..7b9f2f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ spacy pandas textstat beautifulsoup4 +pydantic diff --git a/scipdf/__init__.py b/scipdf/__init__.py index d232dfb..6d95d37 100644 --- a/scipdf/__init__.py +++ b/scipdf/__init__.py @@ -1,6 +1,7 @@ -__version__ = "0.1dev" +__version__ = "1.0.1" __all__ = ["pdf", "features"] from scipdf.features.text_utils import * -from scipdf.pdf.parse_pdf import * +from scipdf.models import * +from scipdf.parse_pdf import * diff --git a/scipdf/features/__init__.py b/scipdf/features/__init__.py index c296bab..cc98ccb 100644 --- a/scipdf/features/__init__.py +++ b/scipdf/features/__init__.py @@ -1,7 +1 @@ -from .text_utils import compute_readability_stats, compute_text_stats - -__all__ = [ - "compute_readability_stats", - "compute_text_stats", - "compute_journal_features", -] +from .text_utils import compute_readability_stats, compute_text_stats , compute_journal_features diff --git a/scipdf/features/text_utils.py b/scipdf/features/text_utils.py index f0c49af..dfe5824 100644 --- a/scipdf/features/text_utils.py +++ b/scipdf/features/text_utils.py @@ -1,10 +1,13 @@ +import warnings + import numpy as np import pandas as pd -import textstat import spacy -from collections import Counter -from itertools import groupby +import textstat +from bs4 import BeautifulSoup +from spacy.tokens import Doc +from scipdf.models import ReadabilityStats, TextStats, JournalFeatures nlp = spacy.load("en_core_web_sm") @@ -12,7 +15,6 @@ VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"] NOUN_LIST = ["NNP", "NNPS"] - SECTIONS_MAPS = { "Authors": "Authors", "AUTHORS": "AUTHORS", @@ -37,7 +39,7 @@ } -def compute_readability_stats(text): +def compute_readability_stats(text) -> ReadabilityStats: """ Compute reading statistics of the given text Reference: https://github.com/shivam5992/textstat @@ -46,39 +48,58 @@ def compute_readability_stats(text): ========== text: str, input section or abstract text """ - try: - readability_dict = { - "flesch_reading_ease": textstat.flesch_reading_ease(text), - "smog": textstat.smog_index(text), - "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text), - "coleman_liau_index": textstat.coleman_liau_index(text), - "automated_readability_index": textstat.automated_readability_index(text), - "dale_chall": textstat.dale_chall_readability_score(text), - "difficult_words": textstat.difficult_words(text), - "linsear_write": textstat.linsear_write_formula(text), - "gunning_fog": textstat.gunning_fog(text), - "text_standard": textstat.text_standard(text), - "n_syllable": textstat.syllable_count(text), - 
"avg_letter_per_word": textstat.avg_letter_per_word(text), - "avg_sentence_length": textstat.avg_sentence_length(text), - } - except: - readability_dict = { - "flesch_reading_ease": None, - "smog": None, - "flesch_kincaid_grade": None, - "coleman_liau_index": None, - "automated_readability_index": None, - "dale_chall": None, - "difficult_words": None, - "linsear_write": None, - "gunning_fog": None, - "text_standard": None, - "n_syllable": None, - "avg_letter_per_word": None, - "avg_sentence_length": None, - } - return readability_dict + functions = { + "flesch_reading_ease": textstat.flesch_reading_ease, + "smog": textstat.smog_index, + "flesch_kincaid_grade": textstat.flesch_kincaid_grade, + "coleman_liau_index": textstat.coleman_liau_index, + "automated_readability_index": textstat.automated_readability_index, + "dale_chall": textstat.dale_chall_readability_score, + "difficult_words": textstat.difficult_words, + "linsear_write": textstat.linsear_write_formula, + "gunning_fog": textstat.gunning_fog, + "text_standard": textstat.text_standard, + "n_syllable": textstat.syllable_count, + "avg_letter_per_word": textstat.avg_letter_per_word, + "avg_sentence_length": textstat.avg_sentence_length, + } + + readability_dict = {} + + for key, function in functions.items(): + try: + readability_dict[key] = function(text) + except Exception: + readability_dict[key] = None + + return ReadabilityStats(**readability_dict) + + +from collections import Counter + + +def count_pos(text): + return dict(Counter([token.pos_ for token in text])) + + +def count_pos_tag(text): + return dict(Counter([token.tag_ for token in text])) + + +def sum_present_verb(pos_tag): + return sum([v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST]) + + +def sum_verb(pos_tag): + return sum([v for k, v in pos_tag.items() if k in VERB_LIST]) + + +def count_word_shape(text): + return dict(Counter([token.shape_ for token in text])) + + +def count_digits(text): + return sum([token.is_digit or token.like_num for token in text]) def compute_text_stats(text): @@ -91,128 +112,79 @@ def compute_text_stats(text): Output ====== - text_stat: dict, part of speech and text features extracted from the given text + text_stats: TextStats, part of speech and text features extracted from the given text """ + spacy_text: Doc = nlp(text) + text_stats_dict = {} + functions = [ + (count_pos, "pos"), + (count_pos_tag, "pos_tag"), + (count_word_shape, "word_shape"), + (count_digits, "n_digits"), + ] + for function, key in functions: + try: + text_stats_dict[key] = function(spacy_text) + except Exception: + text_stats_dict[key] = None try: - pos = dict(Counter([token.pos_ for token in text])) - pos_tag = dict( - Counter([token.tag_ for token in text]) - ) # detailed part-of-speech - - n_present_verb = sum( - [v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST] - ) - n_verb = sum([v for k, v in pos_tag.items() if k in VERB_LIST]) - - word_shape = dict(Counter([token.shape_ for token in text])) # word shape - n_word_per_sents = [len([token for token in sent]) for sent in text.sents] - n_digits = sum([token.is_digit or token.like_num for token in text]) - n_word = sum(n_word_per_sents) - n_sents = len(n_word_per_sents) - text_stats_dict = { - "pos": pos, - "pos_tag": pos_tag, - "word_shape": word_shape, - "n_word": n_word, - "n_sents": n_sents, - "n_present_verb": n_present_verb, - "n_verb": n_verb, - "n_digits": n_digits, - "percent_digits": n_digits / n_word, - "n_word_per_sents": n_word_per_sents, - "avg_word_per_sents": 
np.mean(n_word_per_sents), - } - except: - text_stats_dict = { - "pos": None, - "pos_tag": None, - "word_shape": None, - "n_word": None, - "n_sents": None, - "n_present_verb": None, - "n_verb": None, - "n_digits": None, - "percent_digits": None, - "n_word_per_sents": None, - "avg_word_per_sents": None, - } - return text_stats_dict - - -def compute_journal_features(article): - """ - Parse features about journal references from a given dictionary of parsed article e.g. - number of reference made, number of unique journal refered, minimum year of references, - maximum year of references, ... + pos_tag = text_stats_dict.get("pos_tag", {}) + text_stats_dict["n_present_verb"] = sum_present_verb(pos_tag) + text_stats_dict["n_verb"] = sum_verb(pos_tag) + except Exception: + text_stats_dict["n_present_verb"] = None + text_stats_dict["n_verb"] = None + # Use spacy to parse the text - Parameters - ========== - article: dict, article dictionary parsed from GROBID and converted to dictionary - see ``pdf/parse_pdf.py`` for the detail of the output dictionary + n_word_per_sents = [len([token for token in sent]) for sent in spacy_text.sents] + text_stats_dict["n_word"] = sum(n_word_per_sents) + text_stats_dict["n_sents"] = len(n_word_per_sents) + text_stats_dict["percent_digits"] = ( + text_stats_dict["n_digits"] / text_stats_dict["n_word"] + if text_stats_dict["n_word"] > 0 + else None + ) + text_stats_dict["n_word_per_sents"] = n_word_per_sents + text_stats_dict["avg_word_per_sents"] = np.mean(n_word_per_sents) - Output - ====== - reference_dict: dict, dictionary of - """ - try: - n_reference = len(article["references"]) - n_unique_journals = len( - pd.unique([a["journal"] for a in article["references"]]) - ) - reference_years = [] - for reference in article["references"]: - year = reference["year"] - if year.isdigit(): - # filter outliers - if int(year) in range(1800, 2100): - reference_years.append(int(year)) - avg_ref_year = np.mean(reference_years) - median_ref_year = np.median(reference_years) - min_ref_year = np.min(reference_years) - max_ref_year = np.max(reference_years) - journal_features_dict = { - "n_reference": n_reference, - "n_unique_journals": n_unique_journals, - "avg_ref_year": avg_ref_year, - "median_ref_year": median_ref_year, - "min_ref_year": min_ref_year, - "max_ref_year": max_ref_year, - } - except: - journal_features_dict = { - "n_reference": None, - "n_unique_journals": None, - "avg_ref_year": None, - "median_ref_year": None, - "min_ref_year": None, - "max_ref_year": None, - } - return journal_features_dict - - -def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""): - """ - Merge a list of sections into a normalized list of sections, - you can get the list of sections from parsed article JSON in ``parse_pdf.py`` e.g. 
+ return TextStats(**text_stats_dict) - >> section_list = [s['heading'] for s in article_json['sections']] - >> section_list_merged = merge_section_list(section_list) + +def filter_valid_years(years): + return [year for year in years if year.isdigit() and int(year) in range(1800, 2100)] + + +def compute_journal_features(soup: BeautifulSoup): + """ + Parse features about journal references from a given dictionary of parsed article Parameters ========== - section_list: list, list of sections + soup: dict, article dictionary parsed from GROBID and converted to dictionary Output ====== - section_list_merged: list, sections + journal_features: JournalFeatures, features about journal references """ - sect_map = section_start # text for starting section e.g. ``Introduction`` - section_list_merged = [] - for section in section_list: - if any([(s.lower() in section.lower()) for s in section_maps.keys()]): - sect = [s for s in section_maps.keys() if s.lower() in section.lower()][0] - sect_map = section_maps.get(sect, "") # - section_list_merged.append(sect_map) - else: - section_list_merged.append(sect_map) - return section_list_merged + functions = [ + ("n_reference", lambda: len(soup.get("references", []))), + ("n_unique_journals", lambda: len(pd.unique([a.get("journal") for a in soup.get("references", [])]))), + ("avg_ref_year", lambda: np.mean(filter_valid_years([a.get("year") for a in soup.get("references", [])]))), + ("median_ref_year", lambda: np.median(filter_valid_years([a.get("year") for a in soup.get("references", [])]))), + ("min_ref_year", lambda: np.min(filter_valid_years([a.get("year") for a in soup.get("references", [])])), ), + ("max_ref_year", lambda: np.max(filter_valid_years([a.get("year") for a in soup.get("references", [])]))), + ] + + journal_features_dict = {} + failed_functions = [] + + for key, function in functions: + try: + journal_features_dict[key] = function() + except Exception: + failed_functions.append(key) + + if failed_functions: + warnings.warn(f"The following functions failed: {failed_functions}") + + return JournalFeatures(**journal_features_dict) diff --git a/scipdf/models.py b/scipdf/models.py new file mode 100644 index 0000000..3bc0a0e --- /dev/null +++ b/scipdf/models.py @@ -0,0 +1,102 @@ +""" +All Pydantic models for the scipdf package. 
+""" +from typing import Optional + +from pydantic import BaseModel + + +# Text Stats models +class ReadabilityStats(BaseModel): + flesch_reading_ease: Optional[float] + smog: Optional[float] + flesch_kincaid_grade: Optional[float] + coleman_liau_index: Optional[float] + automated_readability_index: Optional[float] + dale_chall: Optional[float] + difficult_words: Optional[int] + linsear_write: Optional[float] + gunning_fog: Optional[float] + text_standard: Optional[str] + n_syllable: Optional[int] + avg_letter_per_word: Optional[float] + avg_sentence_length: Optional[float] + + +class TextStats(BaseModel): + pos: dict + pos_tag: dict + word_shape: dict + n_word: int + n_sents: int + n_present_verb: Optional[int] + n_verb: Optional[int] + n_digits: int + percent_digits: float + n_word_per_sents: list + avg_word_per_sents: float + + +class JournalFeatures(BaseModel): + n_reference: Optional[int] + n_unique_journals: Optional[int] + avg_ref_year: Optional[float] + median_ref_year: Optional[float] + min_ref_year: Optional[int] + max_ref_year: Optional[int] + + +class TextStatistic(BaseModel): + readability: ReadabilityStats + text_stats: TextStats + journal_features: JournalFeatures + + +# Text content models +class Section(BaseModel): + heading: Optional[str] + text: str + n_publication_ref: int + n_figure_ref: int + + @property + def full_text(self): + return self.heading + "\n" + self.text + + +class Reference(BaseModel): + title: str + journal: str + year: Optional[str] + authors: str + + +class Figure(BaseModel): + figure_label: str + figure_type: str + figure_id: str + figure_caption: str + figure_data: str + + +class Formula(BaseModel): + formula_id: str + formula_text: str + formula_coordinates: list + + +class Article(BaseModel): + title: str + authors: str + pub_date: str + abstract: str + sections: list[Section] + references: list[Reference] + figures: list[Figure] + formulas: list[Formula] + doi: str + text_stats: Optional[TextStatistic] + + @property + def full_text(self) -> str: + return "\n\n".join([section.full_text for section in self.sections]) diff --git a/scipdf/parse_pdf.py b/scipdf/parse_pdf.py new file mode 100644 index 0000000..599226a --- /dev/null +++ b/scipdf/parse_pdf.py @@ -0,0 +1,137 @@ +import os +import os.path as op +import subprocess +import urllib + +import requests +from bs4 import BeautifulSoup + +from scipdf.models import Article +from scipdf.pdf.parser_functions import validate_url, convert_article_soup_to_pydantic + + +class SciPDFParser: + def __init__(self, grobid_url: str = "http://localhost:8070"): + self.grobid_url = grobid_url + self.pdf_figures_jar_path = op.join(op.dirname(__file__), + "pdf/pdffigures2/pdffigures2-assembly-0.0.12-SNAPSHOT.jar") + + def parse_pdf(self, + pdf_path: str, + fulltext: bool = True, + return_coordinates: bool = True, + ) -> Article: + """ + Function to parse PDF to XML or BeautifulSoup using GROBID tool + + You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally + After loading GROBID zip file, you can run GROBID by using the following + >> ./gradlew run + + Parameters + ========== + pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF + fulltext: bool, option for parsing, if True, parse full text of the article + if False, parse only header + grobid_url: str, url to GROBID parser, default at 'http://localhost:8070' + This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service + soup: bool, if True, return BeautifulSoup 
of the article + + Output + ====== + parsed_article: if soup is False, return parsed XML in text format, + else return BeautifulSoup of the XML + Example + ======= + >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True) + """ + # GROBID URL + if fulltext: + url = "%s/api/processFulltextDocument" % self.grobid_url + else: + url = "%s/api/processHeaderDocument" % self.grobid_url + + files = [] + if return_coordinates: + files += [ + ("teiCoordinates", (None, "persName")), + ("teiCoordinates", (None, "figure")), + ("teiCoordinates", (None, "ref")), + ("teiCoordinates", (None, "formula")), + ("teiCoordinates", (None, "biblStruct")), + ] + + if isinstance(pdf_path, str): + if op.splitext(pdf_path)[-1].lower() != ".pdf": + raise ValueError("The input has to end with ``.pdf``") + elif validate_url(pdf_path): + page = urllib.request.urlopen(pdf_path).read() + parsed_article = requests.post(url, files={"input": page}).text + elif op.exists(pdf_path): + parsed_article = requests.post( + url, files={"input": open(pdf_path, "rb")} + ).text + else: + raise RuntimeError("The input URL is not valid") + elif isinstance(pdf_path, bytes): + # assume that incoming is byte string + parsed_article = requests.post(url, files={"input": pdf_path}).text + else: + raise RuntimeError("Failed to parse PDF, Do you have GROBID running?") + + parsed_article = BeautifulSoup(parsed_article, "lxml") + + return convert_article_soup_to_pydantic(parsed_article) + + def parse_figures( + self, + pdf_folder: str, + resolution: int = 300, + output_folder: str = "figures", + ): + """ + Parse figures from the given scientific PDF using pdffigures2 + + Parameters + ========== + pdf_folder: str, path to a folder that contains PDF files. A folder must contains only PDF files + jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file + resolution: int, resolution of the output figures + output_folder: str, path to folder that we want to save parsed data (related to figures) and figures + + Output + ====== + folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures relatively + """ + if not op.isdir(output_folder): + os.makedirs(output_folder) + + # create ``data`` and ``figures`` subfolder within ``output_folder`` + data_path = op.join(output_folder, "data") + figure_path = op.join(output_folder, "figures") + if not op.exists(data_path): + os.makedirs(data_path) + if not op.exists(figure_path): + os.makedirs(figure_path) + + if op.isdir(data_path) and op.isdir(figure_path): + args = [ + "java", + "-jar", + self.pdf_figures_jar_path, + pdf_folder, + "-i", + str(resolution), + "-d", + op.join(op.abspath(data_path), ""), + "-m", + op.join(op.abspath(figure_path), ""), # end path with "/" + ] + _ = subprocess.run( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20 + ) + print("Done parsing figures from PDFs!") + else: + print( + "You may have to check of ``data`` and ``figures`` in the the output folder path." 
+ ) diff --git a/scipdf/pdf/__init__.py b/scipdf/pdf/__init__.py index 0e29fce..e69de29 100644 --- a/scipdf/pdf/__init__.py +++ b/scipdf/pdf/__init__.py @@ -1,9 +0,0 @@ -from .parse_pdf import * - -__all__ = [ - "list_pdf_paths", - "parse_abstract", - "parse_figure_caption", - "parse_references", - "parse_pdf_to_dict", -] diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py deleted file mode 100644 index cf82641..0000000 --- a/scipdf/pdf/parse_pdf.py +++ /dev/null @@ -1,455 +0,0 @@ -import re -import os -import os.path as op -from glob import glob -import urllib -import subprocess -import requests -from bs4 import BeautifulSoup, NavigableString -from tqdm import tqdm, tqdm_notebook - - -GROBID_URL = "http://localhost:8070" -DIR_PATH = op.dirname(op.abspath(__file__)) -PDF_FIGURES_JAR_PATH = op.join( - DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar" -) - - -def list_pdf_paths(pdf_folder: str): - """ - list of pdf paths in pdf folder - """ - return glob(op.join(pdf_folder, "*", "*", "*.pdf")) - - -def validate_url(path: str): - """ - Validate a given ``path`` if it is URL or not - """ - regex = re.compile( - r"^(?:http|ftp)s?://" # http:// or https:// - r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... - r"localhost|" # localhost... - r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip - r"(?::\d+)?" # optional port - r"(?:/?|[/?]\S+)$", - re.IGNORECASE, - ) - return re.match(regex, path) is not None - - -def parse_pdf( - pdf_path: str, - fulltext: bool = True, - soup: bool = False, - return_coordinates: bool = True, - grobid_url: str = GROBID_URL, -): - """ - Function to parse PDF to XML or BeautifulSoup using GROBID tool - - You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally - After loading GROBID zip file, you can run GROBID by using the following - >> ./gradlew run - - Parameters - ========== - pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF - fulltext: bool, option for parsing, if True, parse full text of the article - if False, parse only header - grobid_url: str, url to GROBID parser, default at 'http://localhost:8070' - This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service - soup: bool, if True, return BeautifulSoup of the article - - Output - ====== - parsed_article: if soup is False, return parsed XML in text format, - else return BeautifulSoup of the XML - Example - ======= - >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True) - """ - # GROBID URL - if fulltext: - url = "%s/api/processFulltextDocument" % grobid_url - else: - url = "%s/api/processHeaderDocument" % grobid_url - - files = [] - if return_coordinates: - files += [ - ("teiCoordinates", (None, "persName")), - ("teiCoordinates", (None, "figure")), - ("teiCoordinates", (None, "ref")), - ("teiCoordinates", (None, "formula")), - ("teiCoordinates", (None, "biblStruct")), - ] - - if isinstance(pdf_path, str): - if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf": - print("The input URL has to end with ``.pdf``") - parsed_article = None - elif validate_url(pdf_path) and op.splitext(pdf_path)[-1] == ".pdf": - page = urllib.request.urlopen(pdf_path).read() - parsed_article = requests.post(url, files={"input": page}).text - elif op.exists(pdf_path): - parsed_article = requests.post( - url, files={"input": open(pdf_path, "rb")} - ).text - else: - parsed_article = None - elif isinstance(pdf_path, bytes): - # 
assume that incoming is byte string - parsed_article = requests.post(url, files={"input": pdf_path}).text - else: - parsed_article = None - - if soup and parsed_article is not None: - parsed_article = BeautifulSoup(parsed_article, "lxml") - return parsed_article - - -def parse_authors(article): - """ - Parse authors from a given BeautifulSoup of an article - """ - author_names = article.find("sourcedesc").findAll("persname") - authors = [] - for author in author_names: - firstname = author.find("forename", {"type": "first"}) - firstname = firstname.text.strip() if firstname is not None else "" - middlename = author.find("forename", {"type": "middle"}) - middlename = middlename.text.strip() if middlename is not None else "" - lastname = author.find("surname") - lastname = lastname.text.strip() if lastname is not None else "" - if middlename is not "": - authors.append(firstname + " " + middlename + " " + lastname) - else: - authors.append(firstname + " " + lastname) - authors = "; ".join(authors) - return authors - - -def parse_date(article): - """ - Parse date from a given BeautifulSoup of an article - """ - pub_date = article.find("publicationstmt") - year = pub_date.find("date") - year = year.attrs.get("when") if year is not None else "" - return year - - -def parse_abstract(article): - """ - Parse abstract from a given BeautifulSoup of an article - """ - div = article.find("abstract") - abstract = "" - for p in list(div.children): - if not isinstance(p, NavigableString) and len(list(p)) > 0: - abstract += " ".join( - [elem.text for elem in p if not isinstance(elem, NavigableString)] - ) - return abstract - - -def calculate_number_of_references(div): - """ - For a given section, calculate number of references made in the section - """ - n_publication_ref = len( - [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"] - ) - n_figure_ref = len( - [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"] - ) - return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref} - - -def parse_sections(article, as_list: bool = False): - """ - Parse list of sections from a given BeautifulSoup of an article - - Parameters - ========== - as_list: bool, if True, output text as a list of paragraph instead - of joining it together as one single text - """ - article_text = article.find("text") - divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}) - sections = [] - for div in divs: - div_list = list(div.children) - if len(div_list) == 0: - heading = "" - text = "" - elif len(div_list) == 1: - if isinstance(div_list[0], NavigableString): - heading = str(div_list[0]) - text = "" - else: - heading = "" - text = div_list[0].text - else: - text = [] - heading = div_list[0] - if isinstance(heading, NavigableString): - heading = str(heading) - p_all = list(div.children)[1:] - else: - heading = "" - p_all = list(div.children) - for p in p_all: - if p is not None: - try: - text.append(p.text) - except: - pass - if not as_list: - text = "\n".join(text) - - if heading is not "" or text is not "": - ref_dict = calculate_number_of_references(div) - sections.append( - { - "heading": heading, - "text": text, - "n_publication_ref": ref_dict["n_publication_ref"], - "n_figure_ref": ref_dict["n_figure_ref"], - } - ) - return sections - - -def parse_references(article): - """ - Parse list of references from a given BeautifulSoup of an article - """ - reference_list = [] - references = article.find("text").find("div", attrs={"type": "references"}) 
- references = references.find_all("biblstruct") if references is not None else [] - reference_list = [] - for reference in references: - title = reference.find("title", attrs={"level": "a"}) - if title is None: - title = reference.find("title", attrs={"level": "m"}) - title = title.text if title is not None else "" - journal = reference.find("title", attrs={"level": "j"}) - journal = journal.text if journal is not None else "" - if journal is "": - journal = reference.find("publisher") - journal = journal.text if journal is not None else "" - year = reference.find("date") - year = year.attrs.get("when") if year is not None else "" - authors = [] - for author in reference.find_all("author"): - firstname = author.find("forename", {"type": "first"}) - firstname = firstname.text.strip() if firstname is not None else "" - middlename = author.find("forename", {"type": "middle"}) - middlename = middlename.text.strip() if middlename is not None else "" - lastname = author.find("surname") - lastname = lastname.text.strip() if lastname is not None else "" - if middlename is not "": - authors.append(firstname + " " + middlename + " " + lastname) - else: - authors.append(firstname + " " + lastname) - authors = "; ".join(authors) - reference_list.append( - {"title": title, "journal": journal, "year": year, "authors": authors} - ) - return reference_list - - -def parse_figure_caption(article): - """ - Parse list of figures/tables from a given BeautifulSoup of an article - """ - figures_list = [] - figures = article.find_all("figure") - for figure in figures: - figure_type = figure.attrs.get("type") or "" - figure_id = figure.attrs.get("xml:id") or "" - label = figure.find("label").text - if figure_type == "table": - caption = figure.find("figdesc").text - data = figure.table.text - else: - caption = figure.text - data = "" - figures_list.append( - { - "figure_label": label, - "figure_type": figure_type, - "figure_id": figure_id, - "figure_caption": caption, - "figure_data": data, - } - ) - return figures_list - - -def parse_formulas(article): - """ - Parse list of formulas from a given BeautifulSoup of an article - """ - formulas_list = [] - formulas = article.find_all("formula") - for formula in formulas: - formula_id = formula.attrs["xml:id"] or "" - formula_text = formula.text - formula_coordinates = formula.attrs.get("coords") or "" - if formula_coordinates is not "": - formula_coordinates = [float(x) for x in formula_coordinates.split(",")] - formulas_list.append( - { - "formula_id": formula_id, - "formula_text": formula_text, - "formula_coordinates": formula_coordinates, - } - ) - return formulas_list - - -def convert_article_soup_to_dict(article, as_list: bool = False): - """ - Function to convert BeautifulSoup to JSON format - similar to the output from https://github.com/allenai/science-parse/ - - Parameters - ========== - article: BeautifulSoup - - Output - ====== - article_json: dict, parsed dictionary of a given article in the following format - { - 'title': ..., - 'abstract': ..., - 'sections': [ - {'heading': ..., 'text': ...}, - {'heading': ..., 'text': ...}, - ... - ], - 'references': [ - {'title': ..., 'journal': ..., 'year': ..., 'authors': ...}, - {'title': ..., 'journal': ..., 'year': ..., 'authors': ...}, - ... - ], - 'figures': [ - {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...}, - ... 
- ] - } - """ - article_dict = {} - if article is not None: - title = article.find("title", attrs={"type": "main"}) - title = title.text.strip() if title is not None else "" - - article_dict["title"] = title - article_dict["authors"] = parse_authors(article) - article_dict["pub_date"] = parse_date(article) - article_dict["abstract"] = parse_abstract(article) - article_dict["sections"] = parse_sections(article, as_list=as_list) - article_dict["references"] = parse_references(article) - article_dict["figures"] = parse_figure_caption(article) - article_dict["formulas"] = parse_formulas(article) - - doi = article.find("idno", attrs={"type": "DOI"}) - doi = doi.text if doi is not None else "" - article_dict["doi"] = doi - - return article_dict - else: - return None - - -def parse_pdf_to_dict( - pdf_path: str, - fulltext: bool = True, - soup: bool = True, - as_list: bool = False, - return_coordinates: bool = True, - grobid_url: str = GROBID_URL, -): - """ - Parse the given PDF and return dictionary of the parsed article - - Parameters - ========== - pdf_path: str, path to publication or article - fulltext: bool, whether to extract fulltext or not - soup: bool, whether to return BeautifulSoup or not - as_list: bool, whether to return list of sections or not - grobid_url: str, url to grobid server, default is `GROBID_URL` - This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service - - Ouput - ===== - article_dict: dict, dictionary of an article - """ - parsed_article = parse_pdf( - pdf_path, - fulltext=fulltext, - soup=soup, - return_coordinates=return_coordinates, - grobid_url=grobid_url, - ) - article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list) - return article_dict - - -def parse_figures( - pdf_folder: str, - jar_path: str = PDF_FIGURES_JAR_PATH, - resolution: int = 300, - output_folder: str = "figures", -): - """ - Parse figures from the given scientific PDF using pdffigures2 - - Parameters - ========== - pdf_folder: str, path to a folder that contains PDF files. A folder must contains only PDF files - jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file - resolution: int, resolution of the output figures - output_folder: str, path to folder that we want to save parsed data (related to figures) and figures - - Output - ====== - folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures relatively - """ - if not op.isdir(output_folder): - os.makedirs(output_folder) - - # create ``data`` and ``figures`` subfolder within ``output_folder`` - data_path = op.join(output_folder, "data") - figure_path = op.join(output_folder, "figures") - if not op.exists(data_path): - os.makedirs(data_path) - if not op.exists(figure_path): - os.makedirs(figure_path) - - if op.isdir(data_path) and op.isdir(figure_path): - args = [ - "java", - "-jar", - jar_path, - pdf_folder, - "-i", - str(resolution), - "-d", - op.join(op.abspath(data_path), ""), - "-m", - op.join(op.abspath(figure_path), ""), # end path with "/" - ] - _ = subprocess.run( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20 - ) - print("Done parsing figures from PDFs!") - else: - print( - "You may have to check of ``data`` and ``figures`` in the the output folder path." 
- ) diff --git a/scipdf/pdf/parser_functions.py b/scipdf/pdf/parser_functions.py new file mode 100644 index 0000000..7e97ed4 --- /dev/null +++ b/scipdf/pdf/parser_functions.py @@ -0,0 +1,314 @@ +import re +import warnings +from glob import glob +from os import path as op + +from bs4 import BeautifulSoup, NavigableString + +from scipdf.features import compute_readability_stats, compute_text_stats, compute_journal_features +from scipdf.models import Section, Reference, Figure, Formula, Article, TextStatistic + + +def list_pdf_paths(pdf_folder: str): + """ + list of pdf paths in pdf folder + """ + return glob(op.join(pdf_folder, "*", "*", "*.pdf")) + + +def validate_url(path: str): + """ + Validate a given ``path`` if it is URL or not + """ + regex = re.compile( + r"^(?:http|ftp)s?://" # http:// or https:// + r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... + r"localhost|" # localhost... + r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip + r"(?::\d+)?" # optional port + r"(?:/?|[/?]\S+)$", + re.IGNORECASE, + ) + return re.match(regex, path) is not None + + +def parse_authors(article: BeautifulSoup) -> str: + """ + Parse authors from a given BeautifulSoup of an article + """ + author_names = article.find("sourcedesc").findAll("persname") + authors = [] + for author in author_names: + try: + firstname = author.find("forename", {"type": "first"}) + firstname = firstname.text.strip() if firstname is not None else "" + middlename = author.find("forename", {"type": "middle"}) + middlename = middlename.text.strip() if middlename is not None else "" + lastname = author.find("surname") + lastname = lastname.text.strip() if lastname is not None else "" + if middlename != "": + authors.append(firstname + " " + middlename + " " + lastname) + else: + authors.append(firstname + " " + lastname) + except Exception as e: + warnings.warn(f"Error parsing author: {author}") + authors = "; ".join(authors) + return authors + + +def parse_date(article: BeautifulSoup) -> str: + """ + Parse date from a given BeautifulSoup of an article + """ + pub_date = article.find("publicationstmt") + year = pub_date.find("date") + year = year.attrs.get("when") if year is not None else "" + return year + + +def parse_abstract(article: BeautifulSoup) -> str: + """ + Parse abstract from a given BeautifulSoup of an article + """ + div = article.find("abstract") + abstract = "" + for p in list(div.children): + if not isinstance(p, NavigableString) and len(list(p)) > 0: + abstract += " ".join( + [elem.text for elem in p if not isinstance(elem, NavigableString)] + ) + return abstract + + +def calculate_number_of_references(div): + """ + For a given section, calculate number of references made in the section + """ + n_publication_ref = len( + [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"] + ) + n_figure_ref = len( + [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"] + ) + return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref} + + +def parse_sections(article: BeautifulSoup) -> list[Section]: + """ + Parse list of sections from a given BeautifulSoup of an article + + Parameters + ========== + as_list: bool, if True, output text as a list of paragraph instead + of joining it together as one single text + """ + article_text = article.find("text") + divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}) + sections: list[Section] = [] + for div in divs: + try: + div_list = list(div.children) + if 
len(div_list) == 0: + heading = "" + text = "" + elif len(div_list) == 1: + if isinstance(div_list[0], NavigableString): + heading = str(div_list[0]) + text = "" + else: + heading = "" + text = div_list[0].text + else: + text = [] + heading = div_list[0] + if isinstance(heading, NavigableString): + heading = str(heading) + p_all = list(div.children)[1:] + else: + heading = "" + p_all = list(div.children) + for p in p_all: + if p is not None: + try: + text.append(p.text) + except: + pass + text = " ".join(text) + + if heading != "" or text != "": + ref_dict = calculate_number_of_references(div) + sections.append( + Section( + heading=heading, + text=text, + n_publication_ref=ref_dict["n_publication_ref"], + n_figure_ref=ref_dict["n_figure_ref"], + ) + ) + except Exception as e: + warnings.warn(f"Error parsing section: {div}") + return sections + + +def parse_references(article: BeautifulSoup) -> list[Reference]: + """ + Parse list of references from a given BeautifulSoup of an article + """ + references = article.find("text").find("div", attrs={"type": "references"}) + references = references.find_all("biblstruct") if references is not None else [] + reference_list = [] + for reference in references: + try: + title = reference.find("title", attrs={"level": "a"}) + if title is None: + title = reference.find("title", attrs={"level": "m"}) + title = title.text if title is not None else "" + journal = reference.find("title", attrs={"level": "j"}) + journal = journal.text if journal is not None else "" + if journal == "": + journal = reference.find("publisher") + journal = journal.text if journal is not None else "" + year = reference.find("date") + year = year.attrs.get("when") + authors = [] + for author in reference.find_all("author"): + firstname = author.find("forename", {"type": "first"}) + firstname = firstname.text.strip() if firstname is not None else "" + middlename = author.find("forename", {"type": "middle"}) + middlename = middlename.text.strip() if middlename is not None else "" + lastname = author.find("surname") + lastname = lastname.text.strip() if lastname is not None else "" + if middlename != "": + authors.append(firstname + " " + middlename + " " + lastname) + else: + authors.append(firstname + " " + lastname) + authors = "; ".join(authors) + reference_list.append( + Reference(title=title, journal=journal, year=year, authors=authors) + ) + except: + warnings.warn(f"Error parsing reference: {reference}") + return reference_list + + +def parse_figure_caption(article: BeautifulSoup) -> list[Figure]: + """ + Parse list of figures/tables from a given BeautifulSoup of an article + """ + figures_list = [] + figures = article.find_all("figure") + for figure in figures: + try: + figure_type = figure.attrs.get("type") or "" + figure_id = figure.attrs.get("xml:id") or "" + label = figure.find("label").text + if figure_type == "table": + caption = figure.find("figdesc").text + data = figure.table.text + else: + caption = figure.text + data = "" + figures_list.append( + Figure( + figure_label=label, + figure_type=figure_type, + figure_id=figure_id, + figure_caption=caption, + figure_data=data, + ) + ) + except: + warnings.warn(f"Error parsing figure, {figure}") + return figures_list + + +def parse_formulas(article: BeautifulSoup) -> list[Formula]: + """ + Parse list of formulas from a given BeautifulSoup of an article + + Parameters + ========== + article: BeautifulSoup, parsed article in BeautifulSoup format + + Returns + ======= + formulas_list: list[Formula], list of formulas parsed 
from the article + """ + formulas_list = [] + + formulas = article.find_all("formula") + for formula in formulas: + try: + formula_id = formula.attrs.get("xml:id", "") + formula_text = formula.text + formula_coordinates = formula.attrs.get("coords", []) + + if formula_coordinates: + formula_coordinates = [float(x) for x in formula_coordinates.split(",")] + + formula_data = Formula( + formula_id=formula_id, + formula_text=formula_text, + formula_coordinates=formula_coordinates, + ) + formulas_list.append(formula_data) + except: + warnings.warn(f"Error parsing formula, {formula}") + + return formulas_list + + +def calculate_text_stats(article: Article, soup: BeautifulSoup) -> Article: + """ + Function to calculate text statistics for a given article + + Parameters + ========== + article: Article, parsed article in JSON format + + Returns + ======= + article: Article, parsed article with text statistics + """ + full_text = article.full_text + return TextStatistic( + readability=compute_readability_stats(full_text), + text_stats=compute_text_stats(full_text), + journal_features=compute_journal_features(soup), + ) + + +def convert_article_soup_to_pydantic(soup: BeautifulSoup) -> Article: + """ + Function to convert BeautifulSoup to JSON format similar to the output from https://github.com/allenai/science-parse/ + + Parameters + ========== + soup: BeautifulSoup + + Output + ====== + article_dict: ArticleDict, parsed dictionary of a given article + """ + if soup is None: + raise ValueError("Soup is None") + + title = soup.find("title", attrs={"type": "main"}) + title = title.text.strip() if title is not None else "" + doi = soup.find("idno", attrs={"type": "DOI"}) + doi = doi.text if doi is not None else "" + + article = Article( + title=title, + authors=parse_authors(soup), + pub_date=parse_date(soup), + abstract=parse_abstract(soup), + sections=parse_sections(soup), + references=parse_references(soup), + figures=parse_figure_caption(soup), + formulas=parse_formulas(soup), + doi=doi, + ) + + article.text_stats = calculate_text_stats(article, soup) + return article diff --git a/serve_grobid.sh b/serve_grobid.sh index dfc27c6..1c96e5c 100644 --- a/serve_grobid.sh +++ b/serve_grobid.sh @@ -1,14 +1,7 @@ #!/bin/bash +# assumes you have docker and nvidia-container-toolkit installed +# see https://aur.archlinux.org/packages/nvidia-container-toolkit +# After installing, you need to restart docker +# sudo systemctl restart docker -# download GROBID if directory does not exist -declare -r GROBID_VERSION="0.6.2" # or change to current stable version - -if [ ! 
-d grobid-${GROBID_VERSION} ]; then - wget https://github.com/kermitt2/grobid/archive/${GROBID_VERSION}.zip - unzip "${GROBID_VERSION}.zip" - rm "${GROBID_VERSION}.zip" -fi - -# run GROBID -cd grobid-${GROBID_VERSION} || exit -./gradlew run +docker run -t --rm --gpus all -p 8070:8070 grobid/grobid:0.7.3 diff --git a/setup.py b/setup.py index 494d979..3f6e9ff 100644 --- a/setup.py +++ b/setup.py @@ -7,20 +7,24 @@ if __name__ == "__main__": setup( name='scipdf', - version='0.1dev', + version='1.0.3', description=' Python parser for scientific PDF based on GROBID.', long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/titipata/scipdf_parser', - author='Titipat Achakulvisut', - author_email='my.titipat@gmail.com', - license='(c) MIT License 2019 Titipat Achakulvisut', - install_requires=['lxml', 'requests', 'spacy', 'pandas', 'textstat'], + url='https://github.com/skuam/scipdf_parser', + author='Titipat Achakulvisut, Mateusz Jakubnczak', + author_email='my.titipat@gmail.com, mateusz.jakubczak.contact+githubSciPDfParser@gmail.com', + license='(c) MIT License 2023 Titipat Achakulvisut, Mateusz Jakubczak', + install_requires=['lxml', 'requests', 'spacy', 'pandas', 'textstat', "pydantic", "beautifulsoup4"], packages=find_packages(), keywords=[ "PDF parser", "GROBID", - "Python PDF parser" + "Python PDF parser", + "Pydantic", + "Scientific PDF parser", + "Scientific PDF", + "PDF", ], classifiers=[ "Programming Language :: Python :: 3", diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_parse_pdf.py b/test/test_parse_pdf.py new file mode 100644 index 0000000..36178a4 --- /dev/null +++ b/test/test_parse_pdf.py @@ -0,0 +1,16 @@ +import os +import pytest + +from scipdf.models import Article +from scipdf.parse_pdf import SciPDFParser + + +def test_parse_pdf(): + # Requires GROBID to be running locally + try: + parser = SciPDFParser() + article: Article = parser.parse_pdf(os.path.join(os.path.dirname(__file__), "../example_data/futoma2017improved.pdf" )) + assert article.title == 'An Improved Multi-Output Gaussian Process RNN with Real-Time Validation for Early Sepsis Detection' + except OSError: + print(" \n GROBID is not running locally, skipping test_parse_pdf") + pytest.skip("GROBID is not running locally, skipping test_parse_pdf")
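+
+
+def test_compute_readability_stats():
+    # A hypothetical extra check (not in the original suite): readability stats can be
+    # computed without GROBID, assuming the en_core_web_sm spacy model is installed
+    from scipdf.features.text_utils import compute_readability_stats
+
+    stats = compute_readability_stats(
+        "This is a simple sentence. It is used to check the readability statistics."
+    )
+    assert stats.flesch_reading_ease is not None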