From 947385b0764385e7df81f1fe56ef9de5b8e6c1ee Mon Sep 17 00:00:00 2001 From: alisoncallahan Date: Wed, 20 Apr 2022 09:26:40 -0700 Subject: [PATCH 1/4] First commit of RadGraph dataset loader. Needs fixes to source schema to properly represent unique keys. --- biodatasets/radgraph/radgraph.py | 267 +++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 biodatasets/radgraph/radgraph.py diff --git a/biodatasets/radgraph/radgraph.py b/biodatasets/radgraph/radgraph.py new file mode 100644 index 00000000..0d90fe4c --- /dev/null +++ b/biodatasets/radgraph/radgraph.py @@ -0,0 +1,267 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. 
+ * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. + +[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +""" + +import json +import os +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@article{, + author = {Jain, S., Agrawal, A., Saporta, A., Truong, S. Q., Nguyen Duong, D., Bui, T., Chambon, P., Lungren, M., Ng, A., Langlotz, C., & Rajpurkar, P. }, + title = {RadGraph: Extracting Clinical Entities and Relations from Radiology Reports (version 1.0.0)}, + journal = {PhysioNet}, + volume = {}, + year = {2021}, + url = {https://physionet.org/content/radgraph/1.0.0/}, + doi = {10.13026/hm87-5p47}, + biburl = {}, + bibsource = {} +} +""" + +_DATASETNAME = "radgraph" + +_DESCRIPTION = """\ +This dataset is derived from radiology reports and is designed for named entity recognition and relatation extraction. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "https://physionet.org/content/radgraph/1.0.0/" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = """\ + The PhysioNet Credentialed Health Data License + Version 1.5.0 + + Copyright (c) 2022 MIT Laboratory for Computational Physiology + + The MIT Laboratory for Computational Physiology (MIT-LCP) wishes to make data available for research and educational purposes to qualified requestors, but only if the data are used and protected in accordance with the terms and conditions stated in this License. 
+ + It is hereby agreed between the data requestor, hereinafter referred to as the "LICENSEE", and MIT-LCP, that: + + The LICENSEE will not attempt to identify any individual or institution referenced in PhysioNet restricted data. + The LICENSEE will exercise all reasonable and prudent care to avoid disclosure of the identity of any individual or institution referenced in PhysioNet restricted data in any publication or other communication. + The LICENSEE will not share access to PhysioNet restricted data with anyone else. + The LICENSEE will exercise all reasonable and prudent care to maintain the physical and electronic security of PhysioNet restricted data. + + If the LICENSEE finds information within PhysioNet restricted data that he or she believes might permit identification of any individual or institution, the LICENSEE will report the location of this information promptly by email to PHI-report@physionet.org, citing the location of the specific information in question. + The LICENSEE will use the data for the sole purpose of lawful use in scientific research and no other. + The LICENSEE will be responsible for ensuring that he or she maintains up to date certification in human research subject protection and HIPAA regulations. + The LICENSEE agrees to contribute code associated with publications arising from this data to a repository that is open to the research community. + This agreement may be terminated by either party at any time, but the LICENSEE's obligations with respect to PhysioNet data shall continue after termination. + THE DATA ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE DATA OR THE USE OR OTHER DEALINGS IN THE DATA. + """ + +# Local dataset - available only after completing PhysioNet requirements +_URLS = {} + +_SUPPORTED_TASKS = [ + Tasks.NAMED_ENTITY_RECOGNITION, + Tasks.RELATION_EXTRACTION + ] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +class RadgraphDataset(datasets.GeneratorBasedBuilder): + """RadGraph is a dataset of entities and relations in full-text radiology reports.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="radgraph_source", + version=SOURCE_VERSION, + description="RadGraph source schema", + schema="source", + subset_id="radgraph", + ), + BigBioConfig( + name="radgraph_bigbio_kb", + version=BIGBIO_VERSION, + description="RadGraph BigBio schema", + schema="bigbio_kb", + subset_id="radgraph", + ), + ] + + DEFAULT_CONFIG_NAME = "radgraph_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "report_id" : { + "text": datasets.Value("string"), + "entities": { + "entity_id": { + "tokens": datasets.Value("string"), + "label": datasets.Value("string"), + "start_ix": datasets.Value("int32"), + "end_ix": datasets.Value("int32"), + "relations": [[datasets.Value("string")]] + }, + }, + "data_source": datasets.Value("string"), + "data_split": datasets.Value("string"), + } + } + ) + + # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. 
+ + # For example bigbio_kb, bigbio_t2t + elif self.config.schema == "bigbio_kb": + # e.g. features = schemas.kb_features + # TODO: Choose your big-bio schema here + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.json"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.json"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.json"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
+ + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + ''' + "report_id": { + "text": datasets.Value("string"), + "entities": { + "entity_id": { + "tokens": datasets.Value("string"), + "label": datasets.Value("string"), + "start_ix": datasets.Value("int32"), + "end_ix": datasets.Value("int32"), + "relations": [[datasets.Value("string")]] + }, + }, + "data_source": datasets.Value("string"), + "data_split": datasets.Value("string"), + } + ''' + + if self.config.schema == "source": + with open(filepath) as json_file: + data = json.load(json_file) + uid = 0 + for chart_id in data: + example = {} + chart_data = data[chart_id] + example = { + chart_id: { + "text" : chart_data["text"], + "entities" : chart_data["entities"], + "data_source" : chart_data["data_source"], + "data_split": chart_data["data_split"] + } + } + yield uid, example + uid +=1 + + # elif self.config.schema == "bigbio_kb": + # # TODO: yield (key, example) tuples in the bigbio schema + # for key, example in thing: + # yield key, example + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +#if __name__ == "__main__": +# datasets.load_dataset(__file__) From 10371c1f9746b33fee1b311f9ed269a98b76b248 Mon Sep 17 00:00:00 2001 From: alisoncallahan Date: Wed, 20 Apr 2022 15:47:37 -0700 Subject: [PATCH 2/4] Updated description --- biodatasets/radgraph/radgraph.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/biodatasets/radgraph/radgraph.py b/biodatasets/radgraph/radgraph.py index 0d90fe4c..f6936e46 100644 --- a/biodatasets/radgraph/radgraph.py +++ b/biodatasets/radgraph/radgraph.py @@ -14,21 +14,7 @@ # limitations under the License. 
""" -This template serves as a starting point for contributing a dataset to the BigScience Biomedical repo. - -When modifying it for your dataset, look for TODO items that offer specific instructions. - -Full documentation on writing dataset loading scripts can be found here: -https://huggingface.co/docs/datasets/add_dataset.html - -To create a dataset loading script you will create a class and implement 3 methods: - * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. - -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. - -[bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment) +The RadGraph dataset is derived from radiology reports and is designed for named entity recognition and relatation extraction. 
""" import json From 89ad7b0f0455fb09c7a1bfeb35b7a6a04bc6a233 Mon Sep 17 00:00:00 2001 From: alisoncallahan Date: Thu, 21 Apr 2022 15:53:09 -0700 Subject: [PATCH 3/4] fixes to source schema --- biodatasets/radgraph/radgraph.py | 39 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/biodatasets/radgraph/radgraph.py b/biodatasets/radgraph/radgraph.py index f6936e46..34e634de 100644 --- a/biodatasets/radgraph/radgraph.py +++ b/biodatasets/radgraph/radgraph.py @@ -115,24 +115,28 @@ class RadgraphDataset(datasets.GeneratorBasedBuilder): DEFAULT_CONFIG_NAME = "radgraph_source" def _info(self) -> datasets.DatasetInfo: - if self.config.schema == "source": features = datasets.Features( { - "report_id" : { - "text": datasets.Value("string"), - "entities": { - "entity_id": { - "tokens": datasets.Value("string"), - "label": datasets.Value("string"), - "start_ix": datasets.Value("int32"), - "end_ix": datasets.Value("int32"), - "relations": [[datasets.Value("string")]] - }, + "report_id" : datasets.Value("string"), + "text": datasets.Value("string"), + "entities": [ + { + "entity_id": datasets.Value("string"), + "tokens": datasets.Value("string"), + "label": datasets.Value("string"), + "start_ix": datasets.Value("int32"), + "end_ix": datasets.Value("int32"), + "relations": [ + { + "type": datasets.Value("string"), # e.g. "modify" + "arg": datasets.Value("string") # e.g. 
"7" + } + ] }, - "data_source": datasets.Value("string"), - "data_split": datasets.Value("string"), - } + ], + "data_source": datasets.Value("string"), + "data_split": datasets.Value("string"), } ) @@ -224,15 +228,16 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: data = json.load(json_file) uid = 0 for chart_id in data: + print(chart_id) example = {} chart_data = data[chart_id] example = { - chart_id: { + "report_id": chart_id, "text" : chart_data["text"], - "entities" : chart_data["entities"], + "entities" : [{"entity_id":"", "tokens": "", "label": "", "start_ix": 3, "end_ix":2, "relations":[["apple"]]}], + #"entities" : chart_data["entities"], "data_source" : chart_data["data_source"], "data_split": chart_data["data_split"] - } } yield uid, example uid +=1 From 0e645e23d26cb9ecf3b52388aa73f2846ac58020 Mon Sep 17 00:00:00 2001 From: alisoncallahan Date: Tue, 26 Apr 2022 22:08:21 -0700 Subject: [PATCH 4/4] updated radgraph data loader to complete source schema and _generate_examples for source schema --- biodatasets/radgraph/radgraph.py | 182 ++++++++++++++++++++++--------- 1 file changed, 130 insertions(+), 52 deletions(-) diff --git a/biodatasets/radgraph/radgraph.py b/biodatasets/radgraph/radgraph.py index 34e634de..d2647b16 100644 --- a/biodatasets/radgraph/radgraph.py +++ b/biodatasets/radgraph/radgraph.py @@ -115,6 +115,7 @@ class RadgraphDataset(datasets.GeneratorBasedBuilder): DEFAULT_CONFIG_NAME = "radgraph_source" def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": features = datasets.Features( { @@ -127,19 +128,20 @@ def _info(self) -> datasets.DatasetInfo: "label": datasets.Value("string"), "start_ix": datasets.Value("int32"), "end_ix": datasets.Value("int32"), + "labeler": datasets.Value("string"), "relations": [ { + "relation_id": datasets.Value("string"), "type": datasets.Value("string"), # e.g. "modify" "arg": datasets.Value("string") # e.g. 
"7" - } - ] + }, + ], }, ], "data_source": datasets.Value("string"), - "data_split": datasets.Value("string"), + "data_split": datasets.Value("string") } ) - # Choose the appropriate bigbio schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple bigbio configs with a bigbio_[bigbio_schema_name] format. @@ -157,7 +159,7 @@ def _info(self) -> datasets.DatasetInfo: license=_LICENSE, citation=_CITATION, ) - + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" @@ -194,65 +196,141 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] + def _get_radgraph_entity(self, entity_id, entity_data, labeler): + """Build radgraph entity from source entity JSON. + + Parameters + ---------- + entity_id : string + entity identifier from source data + entity_data: dict + entity record consisting of entity tokens, label, start index and end index + labeler: string + labeler identifier from source data + + Returns + ------- + dict + entity information + """ + return { + "labeler" : labeler, + "entity_id": entity_id, + "tokens": entity_data["tokens"], + "label": entity_data["label"], + "start_ix": entity_data["start_ix"], + "end_ix": entity_data["end_ix"] + } + + def _get_radgraph_relations(self, relations_data, uid): + """Build entity relations from source entity relations JSON. 
+ + Parameters + ---------- + relations_data: list + list of relation records where each record is also a list, where the first element is the relation type, and the second element is the entity ID it refers to + uid: int + unique identifier + + Returns + ------- + int + next unused unique identifier + list + relation dicts with "relation_id", "type" and "arg" keys + """ + relations = [] + for relation_list in relations_data: + relation = { + "relation_id": str(uid), + "type": relation_list[0], + "arg": relation_list[1] + } + relations.append(relation) + uid +=1 + return(uid, relations) + + def _parse_train_dev_data(self, data, chart_id, uid): + """Parse train or dev JSON, return example""" + example = {} + entities = [] + chart_data = data[chart_id] + example = { + "report_id": chart_id, + "text" : chart_data["text"], + "data_source" : chart_data["data_source"], + "data_split": chart_data["data_split"] + } + for entity_id in chart_data["entities"]: + entity_data = chart_data["entities"][entity_id] + entity = self._get_radgraph_entity(entity_id, entity_data, "") + uid, relations = self._get_radgraph_relations(entity_data["relations"], uid) + + entity["relations"] = relations + entities.append(entity) + + example["entities"] = entities + + return(uid, example) + + def _parse_test_data(self, data, chart_id, uid): + """Parse test JSON, return example""" + example = {} + entities = [] + chart_data = data[chart_id] + + example = { + "report_id": chart_id, + "text" : chart_data["text"], + "data_source" : chart_data["data_source"], + "data_split": chart_data["data_split"], + } + + for entity_id in chart_data["labeler_1"]["entities"]: + entity_data = chart_data["labeler_1"]["entities"][entity_id] + entity = self._get_radgraph_entity(entity_id, entity_data, "labeler_1") + uid, relations = self._get_radgraph_relations(entity_data["relations"], uid) + + entity["relations"] = relations + entities.append(entity) + + for entity_id in chart_data["labeler_2"]["entities"]: + entity_data = 
chart_data["labeler_2"]["entities"][entity_id] + + entity = self._get_radgraph_entity(entity_id, entity_data, "labeler_2") + uid, relations = self._get_radgraph_relations(entity_data["relations"], uid) + + entity["relations"] = relations + entities.append(entity) + + example["entities"] = entities + + return(uid, example) + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - - # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. - - # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files - - ''' - "report_id": { - "text": datasets.Value("string"), - "entities": { - "entity_id": { - "tokens": datasets.Value("string"), - "label": datasets.Value("string"), - "start_ix": datasets.Value("int32"), - "end_ix": datasets.Value("int32"), - "relations": [[datasets.Value("string")]] - }, - }, - "data_source": datasets.Value("string"), - "data_split": datasets.Value("string"), - } - ''' - + if self.config.schema == "source": with open(filepath) as json_file: data = json.load(json_file) uid = 0 - for chart_id in data: - print(chart_id) - example = {} - chart_data = data[chart_id] - example = { - "report_id": chart_id, - "text" : chart_data["text"], - "entities" : [{"entity_id":"", "tokens": "", "label": "", "start_ix": 3, "end_ix":2, "relations":[["apple"]]}], - #"entities" : chart_data["entities"], - "data_source" : chart_data["data_source"], - "data_split": chart_data["data_split"] - } - yield uid, example - uid +=1 + if "train" in filepath or "dev" in filepath: + for chart_id in data: + uid, example = 
self._parse_train_dev_data(data, chart_id, uid) + yield uid, example + uid +=1 + elif "test" in filepath: + + for chart_id in data: + uid, example = self._parse_test_data(data, chart_id, uid) + yield uid, example + uid +=1 # elif self.config.schema == "bigbio_kb": # # TODO: yield (key, example) tuples in the bigbio schema # for key, example in thing: # yield key, example - - -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py - - -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR -#if __name__ == "__main__": -# datasets.load_dataset(__file__)