1+ # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
2+ # Licensed under the Apache License, Version 2.0 (the "License");
3+ # you may not use this file except in compliance with the License.
4+ # You may obtain a copy of the License at
5+ #
6+ # http://www.apache.org/licenses/LICENSE-2.0
7+ #
8+ # Unless required by applicable law or agreed to in writing, software
9+ # distributed under the License is distributed on an "AS IS" BASIS,
10+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+ # See the License for the specific language governing permissions and
12+ # limitations under the License.
13+
14+ import os
15+ import csv
16+ import zipfile
17+ import argparse
18+ import re
19+
class ChemProtTextFormatting:
    """A basic formatter to preprocess the ChemProt dataset.

    Instantiating the class runs the whole pipeline: ``ChemProt_Corpus.zip``
    inside ``input_folder`` is unpacked, then the development, test
    (gold-standard) and training archives it contains are extracted and
    converted into ``dev.tsv``, ``test.tsv`` and ``train.tsv`` inside
    ``output_folder``.
    """

    def __init__(self, input_folder, output_folder):
        """Extract the corpus archives and write one TSV per split.

        Args:
            input_folder: Folder that contains ``ChemProt_Corpus.zip``.
            output_folder: Folder that receives the generated TSV files;
                it is created when it does not exist yet.
        """
        with zipfile.ZipFile(os.path.join(input_folder, "ChemProt_Corpus.zip"), "r") as archive:
            archive.extractall(input_folder)

        chemprot_folder = os.path.join(input_folder, "ChemProt_Corpus")
        os.makedirs(output_folder, exist_ok=True)

        # (archive stem, entities file, relations file, abstracts file, output file)
        splits = (
            ("chemprot_development",
             "chemprot_development_entities.tsv", "chemprot_development_relations.tsv",
             "chemprot_development_abstracts.tsv", "dev.tsv"),
            ("chemprot_test_gs",
             "chemprot_test_entities_gs.tsv", "chemprot_test_relations_gs.tsv",
             "chemprot_test_abstracts_gs.tsv", "test.tsv"),
            ("chemprot_training",
             "chemprot_training_entities.tsv", "chemprot_training_relations.tsv",
             "chemprot_training_abstracts.tsv", "train.tsv"),
        )
        for stem, entities, relations, abstracts, output_name in splits:
            with zipfile.ZipFile(os.path.join(chemprot_folder, stem + ".zip")) as archive:
                archive.extractall(chemprot_folder)
            self.format(os.path.join(chemprot_folder, stem), entities, relations, abstracts,
                        os.path.join(output_folder, output_name))

    @staticmethod
    def _split_sentences(text):
        """Split *text* on '.' and '?' and return ``(start, end, sentence)`` triples.

        ``start``/``end`` are character offsets into *text*. A dot that is
        directly followed by a digit (e.g. "0.5") is not treated as a
        sentence boundary.
        """
        protected = re.sub(r"(.)\.(?=[\d])", r"\1[PROTECTED_DOT]", text)
        triples = []
        start = 0
        for piece in re.split(r'\.|\?', protected):
            sentence = piece.replace("[PROTECTED_DOT]", ".")
            end = start + len(sentence)
            triples.append((start, end, sentence))
            # Every separator ('.' or '?') is exactly one character wide.
            start = end + 1
        return triples

    @staticmethod
    def _mask_entities(sentence, start, end, ordered_entities, target_spans):
        """Replace entity mentions inside one sentence with ``@TYPE$`` markers.

        Args:
            sentence: Sentence text covering document offsets ``start``..``end``.
            start: Document offset of the first character of ``sentence``.
            end: Document offset just past the last character of ``sentence``.
            ordered_entities: ``(start, end, type)`` tuples of every entity of
                the document, sorted in descending order so replacements run
                right-to-left and earlier offsets stay valid.
            target_spans: The ``(start, end)`` spans of the two entities of the
                pair; they are tagged with their own type, every other entity
                that lies inside the sentence is tagged ``OTHER``.

        Returns:
            The masked sentence with surrounding whitespace stripped.
        """
        for offset_start, offset_end, entity_type in ordered_entities:
            local_start = offset_start - start
            # A one-character slice never raises, unlike plain indexing.
            already_masked = sentence[local_start:local_start + 1] == "@"
            if (offset_start, offset_end) in target_spans:
                if already_masked:
                    # An entity starting at the same offset was masked before;
                    # overwrite that whole marker, up to and including its '$'.
                    offset_end = start + sentence.find('$', local_start) + 1
                word = entity_type
            elif offset_start < start or offset_end > end or already_masked:
                continue
            else:
                word = "OTHER"
            sentence = sentence[:local_start] + "@" + word + "$" + sentence[offset_end - start:]
        return sentence.strip()

    def format(self, chemprot_path, entity_filename, relations_filename, abstracts_filename, output_filename):
        """
        Constructs ChemProt dataset for Relation Extraction.

        For every abstract, each (chemical, gene) entity pair whose spans do
        not overlap yields one output row per sentence that fully contains
        both mentions. In that sentence the pair is replaced by ``@<type>$``
        markers and every other entity by ``@OTHER$``.

        Args:
            chemprot_path: Path to files
            entity_filename: Contains labelled mention annotations of chemical compounds and genes/proteins.
                            <PMID> <EntityNumber> <Type of Entity> <Start Character offset> <End Character Offset> <Text String>
            relations_filename: Contains a subset of chemical-protein relations annotations for the Chemprot dataset
                            <PMID> <CPR Group> <Evaluated Y/N> <Relation> <Arg1:EntityNumber1> <Arg2:EntityNumber2>
            abstracts_filename: Contains plain text CHEMPROT PubMed Data
                            <PMID> <Title of the Article> <Abstract of the Article>
            output_filename: Path to output file that will contain preprocessed data
                            <PMID.EntityNumber1.EntityNumber2> <Preprocessed Sentence> <CPR Group>

        Raises:
            ValueError: If a relation row is malformed or refers to a PMID
                that has no entity annotations.
        """
        data = {}

        entity_path = os.path.join(chemprot_path, entity_filename)
        with open(entity_path, mode="r", encoding="utf-8", newline="") as entity_file:
            for entity in csv.reader(entity_file, delimiter="\t"):
                doc_id = entity[0]
                if doc_id not in data:
                    data[doc_id] = {"relations": {}, "entities": {"CHEMICAL": {}, "GENE": {}}}
                # Every non-chemical type is grouped under "GENE"; the exact
                # type string is kept in the tuple and used as the marker text.
                group = "CHEMICAL" if entity[2] == "CHEMICAL" else "GENE"
                data[doc_id]["entities"][group][entity[1]] = (int(entity[3]), int(entity[4]), entity[2])

        relations_path = os.path.join(chemprot_path, relations_filename)
        with open(relations_path, mode="r", encoding="utf-8", newline="") as relations_file:
            for relation in csv.reader(relations_file, delimiter="\t"):
                try:
                    doc_relations = data[relation[0]]["relations"]
                    pair = (relation[4].split("Arg1:")[-1], relation[5].split("Arg2:")[-1])
                    # The flag is stripped so both "Y" and "Y " mark an evaluated relation.
                    label = relation[1] if relation[2].strip() == "Y" else "false"
                except (KeyError, IndexError) as err:
                    raise ValueError(
                        "invalid relation row {!r} in {}".format(relation, relations_path)
                    ) from err
                doc_relations[pair] = label

        num_sentences = 0
        rejected = 0
        abstracts_path = os.path.join(chemprot_path, abstracts_filename)
        with open(output_filename, 'w', encoding="utf-8", newline="") as ofile, \
                open(abstracts_path, mode="r", encoding="utf-8", newline="") as abstracts_file:
            owriter = csv.writer(ofile, delimiter='\t', lineterminator=os.linesep)
            owriter.writerow(["index", "sentence", "label"])

            for abstract in csv.reader(abstracts_file, delimiter="\t"):
                doc_id = abstract[0]
                doc = data.get(doc_id)
                if doc is None:
                    # No annotated entities for this abstract: nothing to pair up.
                    continue
                chemicals = doc["entities"]["CHEMICAL"]
                genes = doc["entities"]["GENE"]
                relations = doc["relations"]

                # NOTE(review): assumes the annotation offsets count exactly one
                # separator character between title and abstract - confirm
                # against the corpus readme.
                line = abstract[1] + "\n" + abstract[2]

                # Both depend only on the abstract, so compute them once
                # instead of once per entity pair.
                sentences = self._split_sentences(line)
                ordered_entities = sorted(list(chemicals.values()) + list(genes.values()), reverse=True)

                for tag1, tag1_details in chemicals.items():
                    for tag2, tag2_details in genes.items():
                        chem_start, chem_end = tag1_details[0], tag1_details[1]
                        gene_start, gene_end = tag2_details[0], tag2_details[1]
                        # Skip pairs whose character spans overlap.
                        if ((chem_start <= gene_start <= chem_end)               # x1 <= y1 <= x2
                                or (chem_start <= gene_end and gene_start <= chem_end)):  # x1 <= y2 and y1 <= x2
                            continue

                        # A relation may be annotated in either argument order.
                        relation = relations.get((tag2, tag1))
                        if relation is None:
                            relation = relations.get((tag1, tag2), "false")

                        target_spans = ((chem_start, chem_end), (gene_start, gene_end))
                        for start, end, sentence in sentences:
                            both_inside = (chem_start >= start and chem_end <= end
                                           and gene_start >= start and gene_end <= end)
                            if not both_inside:
                                rejected += 1
                                continue
                            masked = self._mask_entities(sentence, start, end, ordered_entities, target_spans)
                            owriter.writerow([doc_id + "." + tag1 + "." + tag2, masked, relation])
                            num_sentences += 1

        print("Succesfully written {} samples to {}".format(num_sentences, output_filename))
        print("Rejected are", rejected)
149+
150+
if __name__ == "__main__":
    # Command-line entry point: both folders are mandatory, so a missing one
    # is reported by argparse instead of failing later inside os.path.join.
    parser = argparse.ArgumentParser(
        description='Preprocessing Application for ChemProt'
    )

    parser.add_argument(
        '--input_folder',
        type=str,
        required=True,
        help='Folder that contains ChemProt_Corpus.zip'
    )
    parser.add_argument(
        '--output_folder',
        type=str,
        required=True,
        help='Folder that receives dev.tsv, test.tsv and train.tsv (created if missing)'
    )

    args = parser.parse_args()
    # Constructing the formatter runs the whole extraction/conversion pipeline.
    ChemProtTextFormatting(args.input_folder, args.output_folder)
0 commit comments