diff --git a/README.md b/README.md
index 198a278..5ef03bc 100644
--- a/README.md
+++ b/README.md
@@ -7,13 +7,18 @@ A Python parser for scientific PDF based on [GROBID](https://github.com/kermitt2
 Use `pip` to install from this Github repository
 
 ```bash
-pip install git+https://github.com/titipata/scipdf_parser
+pip install git+https://github.com/skuam/scipdf_parser
 ```
 
 **Note**
 * We also need an `en_core_web_sm` model for spacy, where you can run `python -m spacy download en_core_web_sm` to download it
 * You can change GROBID version in `serve_grobid.sh` to test the parser on a new GROBID version
+
+```bash
+python -m spacy download en_core_web_sm
+```
 
 ## Usage
 
 Run the GROBID using the given bash script before parsing PDF
@@ -26,39 +31,76 @@ This script will download GROBID and run the service at default port 8070 (see m
 
 To parse a PDF provided in `example_data` folder or direct URL, use the following function:
 
 ```python
-import scipdf
-article_dict = scipdf.parse_pdf_to_dict('example_data/futoma2017improved.pdf') # return dictionary
-
-# option to parse directly from URL to PDF, if as_list is set to True, output 'text' of parsed section will be in a list of paragraphs instead
-article_dict = scipdf.parse_pdf_to_dict('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf', as_list=False)
+import json
+from scipdf.parse_pdf import SciPDFParser
+from scipdf.models import Article
+
+parser = SciPDFParser()
+article: Article = parser.parse_pdf('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf')
+
+print(json.dumps(article.dict(), indent=4))
 
 # output example
->> {
-    'title': 'Proceedings of Machine Learning for Healthcare',
-    'abstract': '...',
-    'sections': [
-        {'heading': '...', 'text': '...'},
-        {'heading': '...', 'text': '...'},
-        ...
+{
+    "title": "A new method for measuring daytime sleepiness: the Epworth sleepiness scale.",
+    "authors": "Murray Johns",
+    "pub_date": "1991",
+    "abstract": "Text of abstract",
+    "sections": [
+        {
+            "heading": "Introduction",
+            "text": "Text of introduction",
+            "n_publication_ref": 1,
+            "n_figure_ref": 1
+        }
+    ],
+    "references": [
+        {
+            "title": "The Epworth Sleepiness Scale in Clinical Practice",
+            "journal": "Sleep Breath",
+            "year": "2017",
+            "authors": "Chervin RD, et al."
+        },
+        {
+            "title": "A new method for measuring daytime sleepiness: the Epworth sleepiness scale.",
+            "journal": "Sleep",
+            "year": "1991",
+            "authors": "Johns MW"
+        }
     ],
-    'references': [
-        {'title': '...', 'year': '...', 'journal': '...', 'author': '...'},
-        ...
+    "figures": [
+        {
+            "figure_label": "Figure 1",
+            "figure_type": "table",
+            "figure_id": "fig1",
+            "figure_caption": "Caption of figure 1",
+            "figure_data": "Data of figure 1"
+        }
     ],
-    'figures': [
-        {'figure_label': '...', 'figure_type': '...', 'figure_id': '...', 'figure_caption': '...', 'figure_data': '...'},
-        ...
+    "formulas": [
+        {
+            "formula_id": "f1",
+            "formula_text": "a^2 + b^2 = c^2",
+            "formula_coordinates": [
+                1,
+                2,
+                3,
+                4
+            ]
+        }
     ],
-    'doi': '...'
+    "doi": "10.1111/j.1365-2869.1991.tb00031.x"
 }
-
-xml = scipdf.parse_pdf('example_data/futoma2017improved.pdf', soup=True) # option to parse full XML from GROBID
 ```
+
+**Warning**: Parsing of figures is not yet supported in the pydantic models, so you need to parse them manually.
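+
+The returned `Article` also carries computed text statistics (readability scores, part-of-speech counts, and journal/reference features) on `article.text_stats`. A minimal sketch of reading them, assuming the statistics were computed without errors for your PDF:
+
+```python
+stats = article.text_stats  # TextStatistic: readability, text_stats, journal_features
+if stats is not None:
+    print(stats.readability.flesch_reading_ease)  # Flesch reading ease of the full text
+    print(stats.text_stats.n_word)                # total word count
+    print(stats.journal_features.n_reference)     # number of parsed references
+```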
+ To parse figures from PDF using [pdffigures2](https://github.com/allenai/pdffigures2), you can run ```python -scipdf.parse_figures('example_data', output_folder='figures') # folder should contain only PDF files +from scipdf.parse_pdf import SciPDFParser +parser = SciPDFParser() +parser.parse_figures('example_data', output_folder='figures') # folder should contain only PDF files ``` You can see example output figures in `figures` folder. diff --git a/example.py b/example.py new file mode 100644 index 0000000..85febb5 --- /dev/null +++ b/example.py @@ -0,0 +1,9 @@ +import json +from scipdf.parse_pdf import SciPDFParser +from scipdf.models import Article + +if __name__ == '__main__': + parser = SciPDFParser() + article: Article = parser.parse_pdf('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf') + + print(json.dumps(article.dict(), indent=4)) diff --git a/requirements.txt b/requirements.txt index f90aadb..7b9f2f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ spacy pandas textstat beautifulsoup4 +pydantic diff --git a/scipdf/__init__.py b/scipdf/__init__.py index d232dfb..6d95d37 100644 --- a/scipdf/__init__.py +++ b/scipdf/__init__.py @@ -1,6 +1,7 @@ -__version__ = "0.1dev" +__version__ = "1.0.1" __all__ = ["pdf", "features"] from scipdf.features.text_utils import * -from scipdf.pdf.parse_pdf import * +from scipdf.models import * +from scipdf.parse_pdf import * diff --git a/scipdf/features/__init__.py b/scipdf/features/__init__.py index c296bab..cc98ccb 100644 --- a/scipdf/features/__init__.py +++ b/scipdf/features/__init__.py @@ -1,7 +1 @@ -from .text_utils import compute_readability_stats, compute_text_stats - -__all__ = [ - "compute_readability_stats", - "compute_text_stats", - "compute_journal_features", -] +from .text_utils import compute_readability_stats, compute_text_stats , compute_journal_features diff --git a/scipdf/features/text_utils.py b/scipdf/features/text_utils.py index f0c49af..dfe5824 100644 --- a/scipdf/features/text_utils.py +++ b/scipdf/features/text_utils.py @@ -1,10 +1,13 @@ +import warnings + import numpy as np import pandas as pd -import textstat import spacy -from collections import Counter -from itertools import groupby +import textstat +from bs4 import BeautifulSoup +from spacy.tokens import Doc +from scipdf.models import ReadabilityStats, TextStats, JournalFeatures nlp = spacy.load("en_core_web_sm") @@ -12,7 +15,6 @@ VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"] NOUN_LIST = ["NNP", "NNPS"] - SECTIONS_MAPS = { "Authors": "Authors", "AUTHORS": "AUTHORS", @@ -37,7 +39,7 @@ } -def compute_readability_stats(text): +def compute_readability_stats(text) -> ReadabilityStats: """ Compute reading statistics of the given text Reference: https://github.com/shivam5992/textstat @@ -46,39 +48,58 @@ def compute_readability_stats(text): ========== text: str, input section or abstract text """ - try: - readability_dict = { - "flesch_reading_ease": textstat.flesch_reading_ease(text), - "smog": textstat.smog_index(text), - "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text), - "coleman_liau_index": textstat.coleman_liau_index(text), - "automated_readability_index": textstat.automated_readability_index(text), - "dale_chall": textstat.dale_chall_readability_score(text), - "difficult_words": textstat.difficult_words(text), - "linsear_write": textstat.linsear_write_formula(text), - "gunning_fog": textstat.gunning_fog(text), - "text_standard": textstat.text_standard(text), - "n_syllable": textstat.syllable_count(text), - 
"avg_letter_per_word": textstat.avg_letter_per_word(text), - "avg_sentence_length": textstat.avg_sentence_length(text), - } - except: - readability_dict = { - "flesch_reading_ease": None, - "smog": None, - "flesch_kincaid_grade": None, - "coleman_liau_index": None, - "automated_readability_index": None, - "dale_chall": None, - "difficult_words": None, - "linsear_write": None, - "gunning_fog": None, - "text_standard": None, - "n_syllable": None, - "avg_letter_per_word": None, - "avg_sentence_length": None, - } - return readability_dict + functions = { + "flesch_reading_ease": textstat.flesch_reading_ease, + "smog": textstat.smog_index, + "flesch_kincaid_grade": textstat.flesch_kincaid_grade, + "coleman_liau_index": textstat.coleman_liau_index, + "automated_readability_index": textstat.automated_readability_index, + "dale_chall": textstat.dale_chall_readability_score, + "difficult_words": textstat.difficult_words, + "linsear_write": textstat.linsear_write_formula, + "gunning_fog": textstat.gunning_fog, + "text_standard": textstat.text_standard, + "n_syllable": textstat.syllable_count, + "avg_letter_per_word": textstat.avg_letter_per_word, + "avg_sentence_length": textstat.avg_sentence_length, + } + + readability_dict = {} + + for key, function in functions.items(): + try: + readability_dict[key] = function(text) + except Exception: + readability_dict[key] = None + + return ReadabilityStats(**readability_dict) + + +from collections import Counter + + +def count_pos(text): + return dict(Counter([token.pos_ for token in text])) + + +def count_pos_tag(text): + return dict(Counter([token.tag_ for token in text])) + + +def sum_present_verb(pos_tag): + return sum([v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST]) + + +def sum_verb(pos_tag): + return sum([v for k, v in pos_tag.items() if k in VERB_LIST]) + + +def count_word_shape(text): + return dict(Counter([token.shape_ for token in text])) + + +def count_digits(text): + return sum([token.is_digit or token.like_num for token in text]) def compute_text_stats(text): @@ -91,128 +112,79 @@ def compute_text_stats(text): Output ====== - text_stat: dict, part of speech and text features extracted from the given text + text_stats: TextStats, part of speech and text features extracted from the given text """ + spacy_text: Doc = nlp(text) + text_stats_dict = {} + functions = [ + (count_pos, "pos"), + (count_pos_tag, "pos_tag"), + (count_word_shape, "word_shape"), + (count_digits, "n_digits"), + ] + for function, key in functions: + try: + text_stats_dict[key] = function(spacy_text) + except Exception: + text_stats_dict[key] = None try: - pos = dict(Counter([token.pos_ for token in text])) - pos_tag = dict( - Counter([token.tag_ for token in text]) - ) # detailed part-of-speech - - n_present_verb = sum( - [v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST] - ) - n_verb = sum([v for k, v in pos_tag.items() if k in VERB_LIST]) - - word_shape = dict(Counter([token.shape_ for token in text])) # word shape - n_word_per_sents = [len([token for token in sent]) for sent in text.sents] - n_digits = sum([token.is_digit or token.like_num for token in text]) - n_word = sum(n_word_per_sents) - n_sents = len(n_word_per_sents) - text_stats_dict = { - "pos": pos, - "pos_tag": pos_tag, - "word_shape": word_shape, - "n_word": n_word, - "n_sents": n_sents, - "n_present_verb": n_present_verb, - "n_verb": n_verb, - "n_digits": n_digits, - "percent_digits": n_digits / n_word, - "n_word_per_sents": n_word_per_sents, - "avg_word_per_sents": 
np.mean(n_word_per_sents), - } - except: - text_stats_dict = { - "pos": None, - "pos_tag": None, - "word_shape": None, - "n_word": None, - "n_sents": None, - "n_present_verb": None, - "n_verb": None, - "n_digits": None, - "percent_digits": None, - "n_word_per_sents": None, - "avg_word_per_sents": None, - } - return text_stats_dict - - -def compute_journal_features(article): - """ - Parse features about journal references from a given dictionary of parsed article e.g. - number of reference made, number of unique journal refered, minimum year of references, - maximum year of references, ... + pos_tag = text_stats_dict.get("pos_tag", {}) + text_stats_dict["n_present_verb"] = sum_present_verb(pos_tag) + text_stats_dict["n_verb"] = sum_verb(pos_tag) + except Exception: + text_stats_dict["n_present_verb"] = None + text_stats_dict["n_verb"] = None + # Use spacy to parse the text - Parameters - ========== - article: dict, article dictionary parsed from GROBID and converted to dictionary - see ``pdf/parse_pdf.py`` for the detail of the output dictionary + n_word_per_sents = [len([token for token in sent]) for sent in spacy_text.sents] + text_stats_dict["n_word"] = sum(n_word_per_sents) + text_stats_dict["n_sents"] = len(n_word_per_sents) + text_stats_dict["percent_digits"] = ( + text_stats_dict["n_digits"] / text_stats_dict["n_word"] + if text_stats_dict["n_word"] > 0 + else None + ) + text_stats_dict["n_word_per_sents"] = n_word_per_sents + text_stats_dict["avg_word_per_sents"] = np.mean(n_word_per_sents) - Output - ====== - reference_dict: dict, dictionary of - """ - try: - n_reference = len(article["references"]) - n_unique_journals = len( - pd.unique([a["journal"] for a in article["references"]]) - ) - reference_years = [] - for reference in article["references"]: - year = reference["year"] - if year.isdigit(): - # filter outliers - if int(year) in range(1800, 2100): - reference_years.append(int(year)) - avg_ref_year = np.mean(reference_years) - median_ref_year = np.median(reference_years) - min_ref_year = np.min(reference_years) - max_ref_year = np.max(reference_years) - journal_features_dict = { - "n_reference": n_reference, - "n_unique_journals": n_unique_journals, - "avg_ref_year": avg_ref_year, - "median_ref_year": median_ref_year, - "min_ref_year": min_ref_year, - "max_ref_year": max_ref_year, - } - except: - journal_features_dict = { - "n_reference": None, - "n_unique_journals": None, - "avg_ref_year": None, - "median_ref_year": None, - "min_ref_year": None, - "max_ref_year": None, - } - return journal_features_dict - - -def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""): - """ - Merge a list of sections into a normalized list of sections, - you can get the list of sections from parsed article JSON in ``parse_pdf.py`` e.g. 
+ return TextStats(**text_stats_dict) - >> section_list = [s['heading'] for s in article_json['sections']] - >> section_list_merged = merge_section_list(section_list) + +def filter_valid_years(years): + return [year for year in years if year.isdigit() and int(year) in range(1800, 2100)] + + +def compute_journal_features(soup: BeautifulSoup): + """ + Parse features about journal references from a given dictionary of parsed article Parameters ========== - section_list: list, list of sections + soup: dict, article dictionary parsed from GROBID and converted to dictionary Output ====== - section_list_merged: list, sections + journal_features: JournalFeatures, features about journal references """ - sect_map = section_start # text for starting section e.g. ``Introduction`` - section_list_merged = [] - for section in section_list: - if any([(s.lower() in section.lower()) for s in section_maps.keys()]): - sect = [s for s in section_maps.keys() if s.lower() in section.lower()][0] - sect_map = section_maps.get(sect, "") # - section_list_merged.append(sect_map) - else: - section_list_merged.append(sect_map) - return section_list_merged + functions = [ + ("n_reference", lambda: len(soup.get("references", []))), + ("n_unique_journals", lambda: len(pd.unique([a.get("journal") for a in soup.get("references", [])]))), + ("avg_ref_year", lambda: np.mean(filter_valid_years([a.get("year") for a in soup.get("references", [])]))), + ("median_ref_year", lambda: np.median(filter_valid_years([a.get("year") for a in soup.get("references", [])]))), + ("min_ref_year", lambda: np.min(filter_valid_years([a.get("year") for a in soup.get("references", [])])), ), + ("max_ref_year", lambda: np.max(filter_valid_years([a.get("year") for a in soup.get("references", [])]))), + ] + + journal_features_dict = {} + failed_functions = [] + + for key, function in functions: + try: + journal_features_dict[key] = function() + except Exception: + failed_functions.append(key) + + if failed_functions: + warnings.warn(f"The following functions failed: {failed_functions}") + + return JournalFeatures(**journal_features_dict) diff --git a/scipdf/models.py b/scipdf/models.py new file mode 100644 index 0000000..3bc0a0e --- /dev/null +++ b/scipdf/models.py @@ -0,0 +1,102 @@ +""" +All Pydantic models for the scipdf package. 
+""" +from typing import Optional + +from pydantic import BaseModel + + +# Text Stats models +class ReadabilityStats(BaseModel): + flesch_reading_ease: Optional[float] + smog: Optional[float] + flesch_kincaid_grade: Optional[float] + coleman_liau_index: Optional[float] + automated_readability_index: Optional[float] + dale_chall: Optional[float] + difficult_words: Optional[int] + linsear_write: Optional[float] + gunning_fog: Optional[float] + text_standard: Optional[str] + n_syllable: Optional[int] + avg_letter_per_word: Optional[float] + avg_sentence_length: Optional[float] + + +class TextStats(BaseModel): + pos: dict + pos_tag: dict + word_shape: dict + n_word: int + n_sents: int + n_present_verb: Optional[int] + n_verb: Optional[int] + n_digits: int + percent_digits: float + n_word_per_sents: list + avg_word_per_sents: float + + +class JournalFeatures(BaseModel): + n_reference: Optional[int] + n_unique_journals: Optional[int] + avg_ref_year: Optional[float] + median_ref_year: Optional[float] + min_ref_year: Optional[int] + max_ref_year: Optional[int] + + +class TextStatistic(BaseModel): + readability: ReadabilityStats + text_stats: TextStats + journal_features: JournalFeatures + + +# Text content models +class Section(BaseModel): + heading: Optional[str] + text: str + n_publication_ref: int + n_figure_ref: int + + @property + def full_text(self): + return self.heading + "\n" + self.text + + +class Reference(BaseModel): + title: str + journal: str + year: Optional[str] + authors: str + + +class Figure(BaseModel): + figure_label: str + figure_type: str + figure_id: str + figure_caption: str + figure_data: str + + +class Formula(BaseModel): + formula_id: str + formula_text: str + formula_coordinates: list + + +class Article(BaseModel): + title: str + authors: str + pub_date: str + abstract: str + sections: list[Section] + references: list[Reference] + figures: list[Figure] + formulas: list[Formula] + doi: str + text_stats: Optional[TextStatistic] + + @property + def full_text(self) -> str: + return "\n\n".join([section.full_text for section in self.sections]) diff --git a/scipdf/parse_pdf.py b/scipdf/parse_pdf.py new file mode 100644 index 0000000..599226a --- /dev/null +++ b/scipdf/parse_pdf.py @@ -0,0 +1,137 @@ +import os +import os.path as op +import subprocess +import urllib + +import requests +from bs4 import BeautifulSoup + +from scipdf.models import Article +from scipdf.pdf.parser_functions import validate_url, convert_article_soup_to_pydantic + + +class SciPDFParser: + def __init__(self, grobid_url: str = "http://localhost:8070"): + self.grobid_url = grobid_url + self.pdf_figures_jar_path = op.join(op.dirname(__file__), + "pdf/pdffigures2/pdffigures2-assembly-0.0.12-SNAPSHOT.jar") + + def parse_pdf(self, + pdf_path: str, + fulltext: bool = True, + return_coordinates: bool = True, + ) -> Article: + """ + Function to parse PDF to XML or BeautifulSoup using GROBID tool + + You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally + After loading GROBID zip file, you can run GROBID by using the following + >> ./gradlew run + + Parameters + ========== + pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF + fulltext: bool, option for parsing, if True, parse full text of the article + if False, parse only header + grobid_url: str, url to GROBID parser, default at 'http://localhost:8070' + This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service + soup: bool, if True, return BeautifulSoup 
of the article + + Output + ====== + parsed_article: if soup is False, return parsed XML in text format, + else return BeautifulSoup of the XML + Example + ======= + >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True) + """ + # GROBID URL + if fulltext: + url = "%s/api/processFulltextDocument" % self.grobid_url + else: + url = "%s/api/processHeaderDocument" % self.grobid_url + + files = [] + if return_coordinates: + files += [ + ("teiCoordinates", (None, "persName")), + ("teiCoordinates", (None, "figure")), + ("teiCoordinates", (None, "ref")), + ("teiCoordinates", (None, "formula")), + ("teiCoordinates", (None, "biblStruct")), + ] + + if isinstance(pdf_path, str): + if op.splitext(pdf_path)[-1].lower() != ".pdf": + raise ValueError("The input has to end with ``.pdf``") + elif validate_url(pdf_path): + page = urllib.request.urlopen(pdf_path).read() + parsed_article = requests.post(url, files={"input": page}).text + elif op.exists(pdf_path): + parsed_article = requests.post( + url, files={"input": open(pdf_path, "rb")} + ).text + else: + raise RuntimeError("The input URL is not valid") + elif isinstance(pdf_path, bytes): + # assume that incoming is byte string + parsed_article = requests.post(url, files={"input": pdf_path}).text + else: + raise RuntimeError("Failed to parse PDF, Do you have GROBID running?") + + parsed_article = BeautifulSoup(parsed_article, "lxml") + + return convert_article_soup_to_pydantic(parsed_article) + + def parse_figures( + self, + pdf_folder: str, + resolution: int = 300, + output_folder: str = "figures", + ): + """ + Parse figures from the given scientific PDF using pdffigures2 + + Parameters + ========== + pdf_folder: str, path to a folder that contains PDF files. A folder must contains only PDF files + jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file + resolution: int, resolution of the output figures + output_folder: str, path to folder that we want to save parsed data (related to figures) and figures + + Output + ====== + folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures relatively + """ + if not op.isdir(output_folder): + os.makedirs(output_folder) + + # create ``data`` and ``figures`` subfolder within ``output_folder`` + data_path = op.join(output_folder, "data") + figure_path = op.join(output_folder, "figures") + if not op.exists(data_path): + os.makedirs(data_path) + if not op.exists(figure_path): + os.makedirs(figure_path) + + if op.isdir(data_path) and op.isdir(figure_path): + args = [ + "java", + "-jar", + self.pdf_figures_jar_path, + pdf_folder, + "-i", + str(resolution), + "-d", + op.join(op.abspath(data_path), ""), + "-m", + op.join(op.abspath(figure_path), ""), # end path with "/" + ] + _ = subprocess.run( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20 + ) + print("Done parsing figures from PDFs!") + else: + print( + "You may have to check of ``data`` and ``figures`` in the the output folder path." 
+ ) diff --git a/scipdf/pdf/__init__.py b/scipdf/pdf/__init__.py index 0e29fce..e69de29 100644 --- a/scipdf/pdf/__init__.py +++ b/scipdf/pdf/__init__.py @@ -1,9 +0,0 @@ -from .parse_pdf import * - -__all__ = [ - "list_pdf_paths", - "parse_abstract", - "parse_figure_caption", - "parse_references", - "parse_pdf_to_dict", -] diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py deleted file mode 100644 index cf82641..0000000 --- a/scipdf/pdf/parse_pdf.py +++ /dev/null @@ -1,455 +0,0 @@ -import re -import os -import os.path as op -from glob import glob -import urllib -import subprocess -import requests -from bs4 import BeautifulSoup, NavigableString -from tqdm import tqdm, tqdm_notebook - - -GROBID_URL = "http://localhost:8070" -DIR_PATH = op.dirname(op.abspath(__file__)) -PDF_FIGURES_JAR_PATH = op.join( - DIR_PATH, "pdffigures2", "pdffigures2-assembly-0.0.12-SNAPSHOT.jar" -) - - -def list_pdf_paths(pdf_folder: str): - """ - list of pdf paths in pdf folder - """ - return glob(op.join(pdf_folder, "*", "*", "*.pdf")) - - -def validate_url(path: str): - """ - Validate a given ``path`` if it is URL or not - """ - regex = re.compile( - r"^(?:http|ftp)s?://" # http:// or https:// - r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... - r"localhost|" # localhost... - r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip - r"(?::\d+)?" # optional port - r"(?:/?|[/?]\S+)$", - re.IGNORECASE, - ) - return re.match(regex, path) is not None - - -def parse_pdf( - pdf_path: str, - fulltext: bool = True, - soup: bool = False, - return_coordinates: bool = True, - grobid_url: str = GROBID_URL, -): - """ - Function to parse PDF to XML or BeautifulSoup using GROBID tool - - You can see http://grobid.readthedocs.io/en/latest/Install-Grobid/ on how to run GROBID locally - After loading GROBID zip file, you can run GROBID by using the following - >> ./gradlew run - - Parameters - ========== - pdf_path: str or bytes, path or URL to publication or article or bytes string of PDF - fulltext: bool, option for parsing, if True, parse full text of the article - if False, parse only header - grobid_url: str, url to GROBID parser, default at 'http://localhost:8070' - This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service - soup: bool, if True, return BeautifulSoup of the article - - Output - ====== - parsed_article: if soup is False, return parsed XML in text format, - else return BeautifulSoup of the XML - Example - ======= - >> parsed_article = parse_pdf(pdf_path, fulltext=True, soup=True) - """ - # GROBID URL - if fulltext: - url = "%s/api/processFulltextDocument" % grobid_url - else: - url = "%s/api/processHeaderDocument" % grobid_url - - files = [] - if return_coordinates: - files += [ - ("teiCoordinates", (None, "persName")), - ("teiCoordinates", (None, "figure")), - ("teiCoordinates", (None, "ref")), - ("teiCoordinates", (None, "formula")), - ("teiCoordinates", (None, "biblStruct")), - ] - - if isinstance(pdf_path, str): - if validate_url(pdf_path) and op.splitext(pdf_path)[-1].lower() != ".pdf": - print("The input URL has to end with ``.pdf``") - parsed_article = None - elif validate_url(pdf_path) and op.splitext(pdf_path)[-1] == ".pdf": - page = urllib.request.urlopen(pdf_path).read() - parsed_article = requests.post(url, files={"input": page}).text - elif op.exists(pdf_path): - parsed_article = requests.post( - url, files={"input": open(pdf_path, "rb")} - ).text - else: - parsed_article = None - elif isinstance(pdf_path, bytes): - # 
assume that incoming is byte string - parsed_article = requests.post(url, files={"input": pdf_path}).text - else: - parsed_article = None - - if soup and parsed_article is not None: - parsed_article = BeautifulSoup(parsed_article, "lxml") - return parsed_article - - -def parse_authors(article): - """ - Parse authors from a given BeautifulSoup of an article - """ - author_names = article.find("sourcedesc").findAll("persname") - authors = [] - for author in author_names: - firstname = author.find("forename", {"type": "first"}) - firstname = firstname.text.strip() if firstname is not None else "" - middlename = author.find("forename", {"type": "middle"}) - middlename = middlename.text.strip() if middlename is not None else "" - lastname = author.find("surname") - lastname = lastname.text.strip() if lastname is not None else "" - if middlename is not "": - authors.append(firstname + " " + middlename + " " + lastname) - else: - authors.append(firstname + " " + lastname) - authors = "; ".join(authors) - return authors - - -def parse_date(article): - """ - Parse date from a given BeautifulSoup of an article - """ - pub_date = article.find("publicationstmt") - year = pub_date.find("date") - year = year.attrs.get("when") if year is not None else "" - return year - - -def parse_abstract(article): - """ - Parse abstract from a given BeautifulSoup of an article - """ - div = article.find("abstract") - abstract = "" - for p in list(div.children): - if not isinstance(p, NavigableString) and len(list(p)) > 0: - abstract += " ".join( - [elem.text for elem in p if not isinstance(elem, NavigableString)] - ) - return abstract - - -def calculate_number_of_references(div): - """ - For a given section, calculate number of references made in the section - """ - n_publication_ref = len( - [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"] - ) - n_figure_ref = len( - [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"] - ) - return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref} - - -def parse_sections(article, as_list: bool = False): - """ - Parse list of sections from a given BeautifulSoup of an article - - Parameters - ========== - as_list: bool, if True, output text as a list of paragraph instead - of joining it together as one single text - """ - article_text = article.find("text") - divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}) - sections = [] - for div in divs: - div_list = list(div.children) - if len(div_list) == 0: - heading = "" - text = "" - elif len(div_list) == 1: - if isinstance(div_list[0], NavigableString): - heading = str(div_list[0]) - text = "" - else: - heading = "" - text = div_list[0].text - else: - text = [] - heading = div_list[0] - if isinstance(heading, NavigableString): - heading = str(heading) - p_all = list(div.children)[1:] - else: - heading = "" - p_all = list(div.children) - for p in p_all: - if p is not None: - try: - text.append(p.text) - except: - pass - if not as_list: - text = "\n".join(text) - - if heading is not "" or text is not "": - ref_dict = calculate_number_of_references(div) - sections.append( - { - "heading": heading, - "text": text, - "n_publication_ref": ref_dict["n_publication_ref"], - "n_figure_ref": ref_dict["n_figure_ref"], - } - ) - return sections - - -def parse_references(article): - """ - Parse list of references from a given BeautifulSoup of an article - """ - reference_list = [] - references = article.find("text").find("div", attrs={"type": "references"}) 
- references = references.find_all("biblstruct") if references is not None else [] - reference_list = [] - for reference in references: - title = reference.find("title", attrs={"level": "a"}) - if title is None: - title = reference.find("title", attrs={"level": "m"}) - title = title.text if title is not None else "" - journal = reference.find("title", attrs={"level": "j"}) - journal = journal.text if journal is not None else "" - if journal is "": - journal = reference.find("publisher") - journal = journal.text if journal is not None else "" - year = reference.find("date") - year = year.attrs.get("when") if year is not None else "" - authors = [] - for author in reference.find_all("author"): - firstname = author.find("forename", {"type": "first"}) - firstname = firstname.text.strip() if firstname is not None else "" - middlename = author.find("forename", {"type": "middle"}) - middlename = middlename.text.strip() if middlename is not None else "" - lastname = author.find("surname") - lastname = lastname.text.strip() if lastname is not None else "" - if middlename is not "": - authors.append(firstname + " " + middlename + " " + lastname) - else: - authors.append(firstname + " " + lastname) - authors = "; ".join(authors) - reference_list.append( - {"title": title, "journal": journal, "year": year, "authors": authors} - ) - return reference_list - - -def parse_figure_caption(article): - """ - Parse list of figures/tables from a given BeautifulSoup of an article - """ - figures_list = [] - figures = article.find_all("figure") - for figure in figures: - figure_type = figure.attrs.get("type") or "" - figure_id = figure.attrs.get("xml:id") or "" - label = figure.find("label").text - if figure_type == "table": - caption = figure.find("figdesc").text - data = figure.table.text - else: - caption = figure.text - data = "" - figures_list.append( - { - "figure_label": label, - "figure_type": figure_type, - "figure_id": figure_id, - "figure_caption": caption, - "figure_data": data, - } - ) - return figures_list - - -def parse_formulas(article): - """ - Parse list of formulas from a given BeautifulSoup of an article - """ - formulas_list = [] - formulas = article.find_all("formula") - for formula in formulas: - formula_id = formula.attrs["xml:id"] or "" - formula_text = formula.text - formula_coordinates = formula.attrs.get("coords") or "" - if formula_coordinates is not "": - formula_coordinates = [float(x) for x in formula_coordinates.split(",")] - formulas_list.append( - { - "formula_id": formula_id, - "formula_text": formula_text, - "formula_coordinates": formula_coordinates, - } - ) - return formulas_list - - -def convert_article_soup_to_dict(article, as_list: bool = False): - """ - Function to convert BeautifulSoup to JSON format - similar to the output from https://github.com/allenai/science-parse/ - - Parameters - ========== - article: BeautifulSoup - - Output - ====== - article_json: dict, parsed dictionary of a given article in the following format - { - 'title': ..., - 'abstract': ..., - 'sections': [ - {'heading': ..., 'text': ...}, - {'heading': ..., 'text': ...}, - ... - ], - 'references': [ - {'title': ..., 'journal': ..., 'year': ..., 'authors': ...}, - {'title': ..., 'journal': ..., 'year': ..., 'authors': ...}, - ... - ], - 'figures': [ - {'figure_label': ..., 'figure_type': ..., 'figure_id': ..., 'figure_caption': ..., 'figure_data': ...}, - ... 
- ] - } - """ - article_dict = {} - if article is not None: - title = article.find("title", attrs={"type": "main"}) - title = title.text.strip() if title is not None else "" - - article_dict["title"] = title - article_dict["authors"] = parse_authors(article) - article_dict["pub_date"] = parse_date(article) - article_dict["abstract"] = parse_abstract(article) - article_dict["sections"] = parse_sections(article, as_list=as_list) - article_dict["references"] = parse_references(article) - article_dict["figures"] = parse_figure_caption(article) - article_dict["formulas"] = parse_formulas(article) - - doi = article.find("idno", attrs={"type": "DOI"}) - doi = doi.text if doi is not None else "" - article_dict["doi"] = doi - - return article_dict - else: - return None - - -def parse_pdf_to_dict( - pdf_path: str, - fulltext: bool = True, - soup: bool = True, - as_list: bool = False, - return_coordinates: bool = True, - grobid_url: str = GROBID_URL, -): - """ - Parse the given PDF and return dictionary of the parsed article - - Parameters - ========== - pdf_path: str, path to publication or article - fulltext: bool, whether to extract fulltext or not - soup: bool, whether to return BeautifulSoup or not - as_list: bool, whether to return list of sections or not - grobid_url: str, url to grobid server, default is `GROBID_URL` - This could be changed to "https://cloud.science-miner.com/grobid/" for the cloud service - - Ouput - ===== - article_dict: dict, dictionary of an article - """ - parsed_article = parse_pdf( - pdf_path, - fulltext=fulltext, - soup=soup, - return_coordinates=return_coordinates, - grobid_url=grobid_url, - ) - article_dict = convert_article_soup_to_dict(parsed_article, as_list=as_list) - return article_dict - - -def parse_figures( - pdf_folder: str, - jar_path: str = PDF_FIGURES_JAR_PATH, - resolution: int = 300, - output_folder: str = "figures", -): - """ - Parse figures from the given scientific PDF using pdffigures2 - - Parameters - ========== - pdf_folder: str, path to a folder that contains PDF files. A folder must contains only PDF files - jar_path: str, default path to pdffigures2-assembly-0.0.12-SNAPSHOT.jar file - resolution: int, resolution of the output figures - output_folder: str, path to folder that we want to save parsed data (related to figures) and figures - - Output - ====== - folder: making a folder of output_folder/data and output_folder/figures of parsed data and figures relatively - """ - if not op.isdir(output_folder): - os.makedirs(output_folder) - - # create ``data`` and ``figures`` subfolder within ``output_folder`` - data_path = op.join(output_folder, "data") - figure_path = op.join(output_folder, "figures") - if not op.exists(data_path): - os.makedirs(data_path) - if not op.exists(figure_path): - os.makedirs(figure_path) - - if op.isdir(data_path) and op.isdir(figure_path): - args = [ - "java", - "-jar", - jar_path, - pdf_folder, - "-i", - str(resolution), - "-d", - op.join(op.abspath(data_path), ""), - "-m", - op.join(op.abspath(figure_path), ""), # end path with "/" - ] - _ = subprocess.run( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20 - ) - print("Done parsing figures from PDFs!") - else: - print( - "You may have to check of ``data`` and ``figures`` in the the output folder path." 
- ) diff --git a/scipdf/pdf/parser_functions.py b/scipdf/pdf/parser_functions.py new file mode 100644 index 0000000..7e97ed4 --- /dev/null +++ b/scipdf/pdf/parser_functions.py @@ -0,0 +1,314 @@ +import re +import warnings +from glob import glob +from os import path as op + +from bs4 import BeautifulSoup, NavigableString + +from scipdf.features import compute_readability_stats, compute_text_stats, compute_journal_features +from scipdf.models import Section, Reference, Figure, Formula, Article, TextStatistic + + +def list_pdf_paths(pdf_folder: str): + """ + list of pdf paths in pdf folder + """ + return glob(op.join(pdf_folder, "*", "*", "*.pdf")) + + +def validate_url(path: str): + """ + Validate a given ``path`` if it is URL or not + """ + regex = re.compile( + r"^(?:http|ftp)s?://" # http:// or https:// + r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... + r"localhost|" # localhost... + r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip + r"(?::\d+)?" # optional port + r"(?:/?|[/?]\S+)$", + re.IGNORECASE, + ) + return re.match(regex, path) is not None + + +def parse_authors(article: BeautifulSoup) -> str: + """ + Parse authors from a given BeautifulSoup of an article + """ + author_names = article.find("sourcedesc").findAll("persname") + authors = [] + for author in author_names: + try: + firstname = author.find("forename", {"type": "first"}) + firstname = firstname.text.strip() if firstname is not None else "" + middlename = author.find("forename", {"type": "middle"}) + middlename = middlename.text.strip() if middlename is not None else "" + lastname = author.find("surname") + lastname = lastname.text.strip() if lastname is not None else "" + if middlename != "": + authors.append(firstname + " " + middlename + " " + lastname) + else: + authors.append(firstname + " " + lastname) + except Exception as e: + warnings.warn(f"Error parsing author: {author}") + authors = "; ".join(authors) + return authors + + +def parse_date(article: BeautifulSoup) -> str: + """ + Parse date from a given BeautifulSoup of an article + """ + pub_date = article.find("publicationstmt") + year = pub_date.find("date") + year = year.attrs.get("when") if year is not None else "" + return year + + +def parse_abstract(article: BeautifulSoup) -> str: + """ + Parse abstract from a given BeautifulSoup of an article + """ + div = article.find("abstract") + abstract = "" + for p in list(div.children): + if not isinstance(p, NavigableString) and len(list(p)) > 0: + abstract += " ".join( + [elem.text for elem in p if not isinstance(elem, NavigableString)] + ) + return abstract + + +def calculate_number_of_references(div): + """ + For a given section, calculate number of references made in the section + """ + n_publication_ref = len( + [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"] + ) + n_figure_ref = len( + [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"] + ) + return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref} + + +def parse_sections(article: BeautifulSoup) -> list[Section]: + """ + Parse list of sections from a given BeautifulSoup of an article + + Parameters + ========== + as_list: bool, if True, output text as a list of paragraph instead + of joining it together as one single text + """ + article_text = article.find("text") + divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"}) + sections: list[Section] = [] + for div in divs: + try: + div_list = list(div.children) + if 
len(div_list) == 0: + heading = "" + text = "" + elif len(div_list) == 1: + if isinstance(div_list[0], NavigableString): + heading = str(div_list[0]) + text = "" + else: + heading = "" + text = div_list[0].text + else: + text = [] + heading = div_list[0] + if isinstance(heading, NavigableString): + heading = str(heading) + p_all = list(div.children)[1:] + else: + heading = "" + p_all = list(div.children) + for p in p_all: + if p is not None: + try: + text.append(p.text) + except: + pass + text = " ".join(text) + + if heading != "" or text != "": + ref_dict = calculate_number_of_references(div) + sections.append( + Section( + heading=heading, + text=text, + n_publication_ref=ref_dict["n_publication_ref"], + n_figure_ref=ref_dict["n_figure_ref"], + ) + ) + except Exception as e: + warnings.warn(f"Error parsing section: {div}") + return sections + + +def parse_references(article: BeautifulSoup) -> list[Reference]: + """ + Parse list of references from a given BeautifulSoup of an article + """ + references = article.find("text").find("div", attrs={"type": "references"}) + references = references.find_all("biblstruct") if references is not None else [] + reference_list = [] + for reference in references: + try: + title = reference.find("title", attrs={"level": "a"}) + if title is None: + title = reference.find("title", attrs={"level": "m"}) + title = title.text if title is not None else "" + journal = reference.find("title", attrs={"level": "j"}) + journal = journal.text if journal is not None else "" + if journal == "": + journal = reference.find("publisher") + journal = journal.text if journal is not None else "" + year = reference.find("date") + year = year.attrs.get("when") + authors = [] + for author in reference.find_all("author"): + firstname = author.find("forename", {"type": "first"}) + firstname = firstname.text.strip() if firstname is not None else "" + middlename = author.find("forename", {"type": "middle"}) + middlename = middlename.text.strip() if middlename is not None else "" + lastname = author.find("surname") + lastname = lastname.text.strip() if lastname is not None else "" + if middlename != "": + authors.append(firstname + " " + middlename + " " + lastname) + else: + authors.append(firstname + " " + lastname) + authors = "; ".join(authors) + reference_list.append( + Reference(title=title, journal=journal, year=year, authors=authors) + ) + except: + warnings.warn(f"Error parsing reference: {reference}") + return reference_list + + +def parse_figure_caption(article: BeautifulSoup) -> list[Figure]: + """ + Parse list of figures/tables from a given BeautifulSoup of an article + """ + figures_list = [] + figures = article.find_all("figure") + for figure in figures: + try: + figure_type = figure.attrs.get("type") or "" + figure_id = figure.attrs.get("xml:id") or "" + label = figure.find("label").text + if figure_type == "table": + caption = figure.find("figdesc").text + data = figure.table.text + else: + caption = figure.text + data = "" + figures_list.append( + Figure( + figure_label=label, + figure_type=figure_type, + figure_id=figure_id, + figure_caption=caption, + figure_data=data, + ) + ) + except: + warnings.warn(f"Error parsing figure, {figure}") + return figures_list + + +def parse_formulas(article: BeautifulSoup) -> list[Formula]: + """ + Parse list of formulas from a given BeautifulSoup of an article + + Parameters + ========== + article: BeautifulSoup, parsed article in BeautifulSoup format + + Returns + ======= + formulas_list: list[Formula], list of formulas parsed 
from the article + """ + formulas_list = [] + + formulas = article.find_all("formula") + for formula in formulas: + try: + formula_id = formula.attrs.get("xml:id", "") + formula_text = formula.text + formula_coordinates = formula.attrs.get("coords", []) + + if formula_coordinates: + formula_coordinates = [float(x) for x in formula_coordinates.split(",")] + + formula_data = Formula( + formula_id=formula_id, + formula_text=formula_text, + formula_coordinates=formula_coordinates, + ) + formulas_list.append(formula_data) + except: + warnings.warn(f"Error parsing formula, {formula}") + + return formulas_list + + +def calculate_text_stats(article: Article, soup: BeautifulSoup) -> Article: + """ + Function to calculate text statistics for a given article + + Parameters + ========== + article: Article, parsed article in JSON format + + Returns + ======= + article: Article, parsed article with text statistics + """ + full_text = article.full_text + return TextStatistic( + readability=compute_readability_stats(full_text), + text_stats=compute_text_stats(full_text), + journal_features=compute_journal_features(soup), + ) + + +def convert_article_soup_to_pydantic(soup: BeautifulSoup) -> Article: + """ + Function to convert BeautifulSoup to JSON format similar to the output from https://github.com/allenai/science-parse/ + + Parameters + ========== + soup: BeautifulSoup + + Output + ====== + article_dict: ArticleDict, parsed dictionary of a given article + """ + if soup is None: + raise ValueError("Soup is None") + + title = soup.find("title", attrs={"type": "main"}) + title = title.text.strip() if title is not None else "" + doi = soup.find("idno", attrs={"type": "DOI"}) + doi = doi.text if doi is not None else "" + + article = Article( + title=title, + authors=parse_authors(soup), + pub_date=parse_date(soup), + abstract=parse_abstract(soup), + sections=parse_sections(soup), + references=parse_references(soup), + figures=parse_figure_caption(soup), + formulas=parse_formulas(soup), + doi=doi, + ) + + article.text_stats = calculate_text_stats(article, soup) + return article diff --git a/serve_grobid.sh b/serve_grobid.sh index dfc27c6..1c96e5c 100644 --- a/serve_grobid.sh +++ b/serve_grobid.sh @@ -1,14 +1,7 @@ #!/bin/bash +# assumes you have docker and nvidia-container-toolkit installed +# see https://aur.archlinux.org/packages/nvidia-container-toolkit +# After installing, you need to restart docker +# sudo systemctl restart docker -# download GROBID if directory does not exist -declare -r GROBID_VERSION="0.6.2" # or change to current stable version - -if [ ! 
-d grobid-${GROBID_VERSION} ]; then - wget https://github.com/kermitt2/grobid/archive/${GROBID_VERSION}.zip - unzip "${GROBID_VERSION}.zip" - rm "${GROBID_VERSION}.zip" -fi - -# run GROBID -cd grobid-${GROBID_VERSION} || exit -./gradlew run +docker run -t --rm --gpus all -p 8070:8070 grobid/grobid:0.7.3 diff --git a/setup.py b/setup.py index 494d979..3f6e9ff 100644 --- a/setup.py +++ b/setup.py @@ -7,20 +7,24 @@ if __name__ == "__main__": setup( name='scipdf', - version='0.1dev', + version='1.0.3', description=' Python parser for scientific PDF based on GROBID.', long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/titipata/scipdf_parser', - author='Titipat Achakulvisut', - author_email='my.titipat@gmail.com', - license='(c) MIT License 2019 Titipat Achakulvisut', - install_requires=['lxml', 'requests', 'spacy', 'pandas', 'textstat'], + url='https://github.com/skuam/scipdf_parser', + author='Titipat Achakulvisut, Mateusz Jakubnczak', + author_email='my.titipat@gmail.com, mateusz.jakubczak.contact+githubSciPDfParser@gmail.com', + license='(c) MIT License 2023 Titipat Achakulvisut, Mateusz Jakubczak', + install_requires=['lxml', 'requests', 'spacy', 'pandas', 'textstat', "pydantic", "beautifulsoup4"], packages=find_packages(), keywords=[ "PDF parser", "GROBID", - "Python PDF parser" + "Python PDF parser", + "Pydantic", + "Scientific PDF parser", + "Scientific PDF", + "PDF", ], classifiers=[ "Programming Language :: Python :: 3", diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_parse_pdf.py b/test/test_parse_pdf.py new file mode 100644 index 0000000..36178a4 --- /dev/null +++ b/test/test_parse_pdf.py @@ -0,0 +1,16 @@ +import os +import pytest + +from scipdf.models import Article +from scipdf.parse_pdf import SciPDFParser + + +def test_parse_pdf(): + # Requires GROBID to be running locally + try: + parser = SciPDFParser() + article: Article = parser.parse_pdf(os.path.join(os.path.dirname(__file__), "../example_data/futoma2017improved.pdf" )) + assert article.title == 'An Improved Multi-Output Gaussian Process RNN with Real-Time Validation for Early Sepsis Detection' + except OSError: + print(" \n GROBID is not running locally, skipping test_parse_pdf") + pytest.skip("GROBID is not running locally, skipping test_parse_pdf")
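+
+
+def test_compute_readability_stats():
+    # A hypothetical extra check (not in the original suite): readability stats can be
+    # computed without GROBID, assuming the en_core_web_sm spacy model is installed
+    from scipdf.features.text_utils import compute_readability_stats
+
+    stats = compute_readability_stats(
+        "This is a simple sentence. It is used to check the readability statistics."
+    )
+    assert stats.flesch_reading_ease is not None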