#!/usr/bin/env python3
# coding: utf-8

"""
Class used in scripts/prodigy_to_tsv.py, which converts token-annotated jsonl
files to tab-separated values (tsv) files for use in the deep reference parser.
"""

import csv
import re

import numpy as np
import plac

from ..io import read_jsonl

from ..logger import logger


class TokenLabelPairs:
    """
    Convert prodigy format docs or list of lists into tuples of (token, label).
    """

    def __init__(self, line_limit=73, respect_line_endings=True, respect_doc_endings=True):
        """
        Args:
            line_limit(int): Maximum number of tokens allowed per training
                example. If you are planning to use this data for making
                predictions, then this should correspond to the max_words
                attribute of the DeepReferenceParser class used to train the
                model.
            respect_line_endings(bool): If True, line endings appearing in the
                text will be respected, leading to much shorter line lengths,
                usually <10. Typically this results in a much worse performing
                model, but follows the convention set by Rodrigues et al.
            respect_doc_endings(bool): If True, a line ending is added at the
                end of each document. If False, the end of a document flows
                into the beginning of the next document.
        """

        self.line_count = 0
        self.line_lengths = []
        self.line_limit = line_limit
        self.respect_doc_endings = respect_doc_endings
        self.respect_line_endings = respect_line_endings

    def run(self, docs):
        """
        Convert an iterable of docs into a flat list of (token, label) pairs.

        Args:
            docs: Iterable of prodigy format docs (or lists of tokens).

        Returns:
            list: (token, label) tuples, with (None, None) marking line and
                document endings.
        """

        out = []

        for doc in docs:
            out.extend(self.yield_token_label_pair(doc))

        self.stats(out)

        return out

    def stats(self, out):

        avg_line_len = np.round(np.mean(self.line_lengths), 2)

        logger.debug("Returning %s examples", self.line_count)
        logger.debug("Average line length: %s", avg_line_len)

    def yield_token_label_pair(self, doc, lists=False):
        """
        Expects a single document loaded from a jsonl file.

        Args:
            doc (dict): Document in prodigy format, or a list of tokens.
            lists (bool): If True, expect a list of tokens rather than a
                prodigy format dict.

        NOTE: Makes the assumption that every token has been labelled in spans.
        This assumption will be true if the data has been labelled with prodigy,
        and spans covering entire references have then been converted to token
        spans; OR that there are no spans at all, and this is being used to
        prepare data for prediction.
        """
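
        # For illustration: a token in a prodigy doc looks roughly like
        # {"text": "Smith", "id": 3}, and a token span like
        # {"token_start": 3, "label": "author"}. The values shown are
        # hypothetical; only the keys ("text", "id", "token_start", "label")
        # are relied on below.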

        # Ensure that spans and tokens are sorted (they should be)

        if lists:
            tokens = doc
        else:
            tokens = sorted(doc["tokens"], key=lambda k: k["id"])

        # For prediction, documents may not yet have spans. If they do, sort
        # them too based on token_start which is equivalent to id in
        # doc["tokens"].

        spans = doc.get("spans")

        if spans:
            spans = sorted(doc["spans"], key=lambda k: k["token_start"])

        # Set a token counter that is used to limit the number of tokens to
        # line_limit.

        token_counter = 0

        doc_len = len(tokens)

        for i, token in enumerate(tokens, 1):

            label = None

            # For case when tokens have been labelled with spans (for training
            # data).

            if spans:
                # Need to remove one from the index as enumerate starts at 1!
                label = spans[i - 1].get("label")

            text = token["text"]

            # If the token is empty, even if it has been labelled, skip it.

            if text == "":

                pass

            # If the token is a newline (and possibly other characters) and we want
            # to respect line endings in the text, then yield a (None, None) tuple
            # which will be converted to a blank line when the resulting tsv file
            # is read.

            elif re.search(r"\n", text) and self.respect_line_endings:

                # Is it blank after whitespace is removed?

                if text.strip() == "":

                    yield (None, None)

                    self.line_lengths.append(token_counter)
                    self.line_count += 1

                    token_counter = 0

            elif token_counter == self.line_limit:

                # Yield None, None to signify a line ending, then yield the next
                # token.

                yield (None, None)
                yield (text.strip(), label)

                self.line_lengths.append(token_counter)
                self.line_count += 1

                # Set to one to account for the first token being added.

                token_counter = 1

            elif i == doc_len and self.respect_doc_endings:

                # Case when the end of the document has been reached, but it is
                # less than self.line_limit. This assumes that we want to retain
                # a line ending which denotes the end of a document, and the
                # start of a new one.

                yield (text.strip(), label)
                yield (None, None)

                self.line_lengths.append(token_counter)
                self.line_count += 1

            else:

                # Yield the stripped token text with its label.

                yield (text.strip(), label)

                token_counter += 1


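# A minimal usage sketch, assuming a doc in prodigy format with one span per
# token (token text and label values below are hypothetical):
#
#     doc = {
#         "tokens": [{"text": "Smith", "id": 0}, {"text": "2019", "id": 1}],
#         "spans": [
#             {"token_start": 0, "label": "author"},
#             {"token_start": 1, "label": "year"},
#         ],
#     }
#
# TokenLabelPairs().run([doc]) returns
# [("Smith", "author"), ("2019", "year"), (None, None)], where the trailing
# (None, None) marks the document ending.

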
@plac.annotations(
    input_file=(
        "Path to jsonl file containing prodigy docs.",
        "positional",
        None,
        str
    ),
    output_file=(
        "Path to output tsv file.",
        "positional",
        None,
        str
    )
)
def prodigy_to_tsv(input_file, output_file):
    """
    Convert token-annotated jsonl to token-annotated tsv ready for use in the
    Rodrigues model.
    """

    annotated_data = read_jsonl(input_file)

    logger.info("Loaded %s prodigy docs", len(annotated_data))

    tlp = TokenLabelPairs()
    token_label_pairs = list(tlp.run(annotated_data))

    with open(output_file, "w") as fb:
        writer = csv.writer(fb, delimiter="\t")
        # Write DOCSTART and a blank line
        writer.writerows([("DOCSTART", None), (None, None)])
        writer.writerows(token_label_pairs)

    logger.info("Wrote %s token/label pairs to %s", len(token_label_pairs),
                output_file)
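
# A rough sketch of the resulting tsv layout (token and label values are
# hypothetical): a DOCSTART row, a near-empty separator row (None values are
# written as empty fields), then one token<TAB>label row per token, with
# further empty rows wherever a (None, None) pair marks a line or document
# ending, e.g.
#
#     DOCSTART
#
#     Smith   author
#     2019    year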