# Source: create-primeqa-app/examples/harry-potter-corpus/process.py at 32256ced977bb5d6914361777427ba2faee01194 (primeqa/create-primeqa-app, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#! /usr/bin/env python3

from typing import Any, Generator
import sys
from io import StringIO
import itertools
import re
import spacy
from spacy.language import Language
import csv

def log(stuff: Any):
    """Print the string form of ``stuff`` on standard error.

    Diagnostics go to stderr so that they never mix with the TSV data the
    script emits on stdout.
    """
    message = str(stuff)
    print(message, file=sys.stderr)

def lines_from_file(file_name: str) -> Generator[str, None, None]:
    """Read a UTF-8 text file and yield its lines one at a time.

    The "Processing ..." message is logged when the generator is first
    advanced, not when this function is called, because the body of a
    generator runs lazily.

    Args:
        file_name: Path of the file to read.

    Yields:
        Each line of the file, including its trailing newline if present.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration).
    """
    log(f"Processing {file_name}...")
    # The context manager closes the handle as soon as the generator is
    # exhausted or closed, instead of leaving it to the garbage collector.
    with open(file_name, "r", encoding="utf-8") as handle:
        yield from handle

def strip_newline(lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Yield every input line with all trailing ``\\n`` characters removed.

    Only newline characters are stripped; other trailing whitespace is kept.
    """
    yield from (raw.rstrip("\n") for raw in lines)

def skip(pattern: str, lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Drop every line in which ``pattern`` is found; pass the rest through.

    The pattern is applied with ``re.search``, so it matches anywhere in the
    line unless it is anchored explicitly.
    """
    yield from (text for text in lines if re.search(pattern, text) is None)

def fix_straddling_paragraphs(nlp: Language, lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Merge consecutive lines that are really one paragraph.

    Two adjacent fragments are joined when parsing them together produces a
    different number of sentences than parsing them separately, which means
    a sentence runs across the boundary between them.

    Args:
        nlp: Callable that parses a string and returns an object exposing a
            ``sents`` iterable (a spaCy pipeline in this script).
        lines: Paragraph fragments, one per item.

    Yields:
        The reassembled paragraphs. Empty paragraphs are never yielded, so
        an empty input produces no output at all.
    """
    # Curly double quotes are dropped and the curly apostrophe becomes a
    # straight one. This is applied only to the text handed to the sentence
    # counter; the yielded paragraphs keep their original characters.
    quote_table = str.maketrans({"“": None, "”": None, "’": "'"})

    def _replace_stylish_quotes(text: str) -> str:
        """Replaces stylish quotes and apostrophes."""
        return text.translate(quote_table)

    def _count_sentences(text: str) -> int:
        """Counts the quantity of sentences in a fragment."""
        return sum(1 for _ in nlp(_replace_stylish_quotes(text)).sents)

    accum: str = ""
    accum_sentence_count: int = 0
    for line in lines:
        sentence_count = _count_sentences(line)
        if not accum:
            # Nothing is pending yet (start of input, or the previous line
            # was empty): start a new paragraph instead of emitting "".
            accum = line
            accum_sentence_count = sentence_count
            continue
        concatenation = accum.rstrip(" ") + " " + line
        concatenation_sentence_count = _count_sentences(concatenation)
        if concatenation_sentence_count == accum_sentence_count + sentence_count:
            # Sentence counts add up, so this is a real paragraph boundary:
            # emit the pending paragraph and start over from this line.
            yield accum
            accum = line
            accum_sentence_count = sentence_count
        else:
            # A sentence straddles the boundary: keep accumulating.
            accum = concatenation
            accum_sentence_count = concatenation_sentence_count
    if accum:
        yield accum

def word_qty(doc) -> int:
    """Return how many tokens in ``doc`` are neither whitespace nor punctuation.

    ``doc`` is any iterable of tokens exposing boolean ``is_space`` and
    ``is_punct`` attributes.
    """
    return sum(1 for token in doc if not token.is_space and not token.is_punct)

def combine_up_to_n_words(nlp: Language, n: int, lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Join contiguous short paragraphs while the result stays within n words.

    Paragraphs are appended to a pending chunk, separated by one space, for
    as long as the chunk's word count (as computed by ``word_qty``) does not
    exceed ``n``. A paragraph that would overflow the chunk flushes it and
    starts the next one. A single paragraph longer than ``n`` words is
    passed through unchanged.

    Args:
        nlp: Callable that parses a string into a token sequence accepted by
            ``word_qty``.
        n: Maximum number of words per combined chunk (inclusive).
        lines: Paragraphs, one per item.

    Yields:
        The combined chunks. Empty chunks are never yielded.
    """
    par: str = ""
    count: int = 0
    for line in lines:
        current_count: int = word_qty(nlp(line))
        if count + current_count <= n:
            # Still within budget: append, with one space as separator.
            separator = " " if par else ""
            par = par.rstrip(" ") + separator + line
            count = count + current_count
        else:
            # Over budget: flush what is pending. If the very first paragraph
            # is already longer than n, nothing is pending, and yielding here
            # would emit a spurious empty chunk.
            if par:
                yield par
            par = line
            count = current_count
    if par:
        yield par

def split_by_sentences_up_to_n_words(nlp: Language, n: int, lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Split paragraphs longer than n words at sentence boundaries.

    Paragraphs with at most ``n`` words are passed through untouched. Longer
    ones are re-emitted as chunks of whole sentences, each holding at most
    ``n`` words. A single sentence longer than ``n`` words is emitted on its
    own, because sentences are never cut.

    Args:
        nlp: Callable that parses a string into a document that is iterable
            over tokens and exposes ``sents``; each sentence exposes ``text``.
        n: Maximum number of words per chunk (inclusive).
        lines: Paragraphs, one per item.

    Yields:
        Paragraphs or sentence-aligned chunks. Empty chunks are never yielded.
    """
    for line in lines:
        doc = nlp(line)
        if word_qty(doc) <= n:
            yield line
            continue

        accum: str = ""
        word_count_accum: int = 0
        for sent in doc.sents:
            sentence = sent.text
            sentence_word_count = word_qty(sent)
            # The limit is inclusive, matching the "> n" test above and the
            # "<= n" budget used when combining short paragraphs.
            if word_count_accum + sentence_word_count <= n:
                if accum:
                    accum = accum.rstrip() + " "
                accum = accum + sentence
                word_count_accum = word_count_accum + sentence_word_count
            else:
                # Flush the pending chunk. Nothing is pending when the first
                # sentence alone exceeds the limit, so skip the empty yield.
                if accum:
                    yield accum
                accum = sentence
                word_count_accum = sentence_word_count
        if accum:
            yield accum

def number_lines(lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Prefix each line with its 1-based position and a tab character."""
    for position, text in enumerate(lines, start=1):
        yield "\t".join((str(position), text))

def to_tsv(title_prefix: str, lines: Generator[str, None, None]) -> Generator[str, None, None]:
    """Serialize each line as a two-column TSV row: text, then title.

    The title column is ``"<title_prefix> Paragraph <k>"`` where ``k`` counts
    the rows from 1. Quoting and escaping are delegated to ``csv.writer``;
    the row terminator it appends is removed before the row is yielded.
    """
    # One in-memory sink is reused for every row and emptied after each read.
    sink = StringIO()
    tsv = csv.writer(sink, delimiter='\t', lineterminator='\n')

    for index, text in enumerate(lines, start=1):
        title = title_prefix + " Paragraph " + str(index)
        tsv.writerow([text, title])
        row: str = sink.getvalue().strip("\r\n")
        # Rewind and clear so the next row starts from an empty buffer.
        sink.seek(0)
        sink.truncate(0)
        yield row

def write(out, lines: Generator[str, None, None]):
    """Send every line to ``out``, each followed by a newline character.

    ``out`` only needs a ``write`` method that accepts a string.
    """
    emit = out.write
    for text in lines:
        emit(text + "\n")

if __name__ == "__main__":
    # Usage: process.py BOOK_FILE [BOOK_FILE ...]
    # Writes a TSV with the columns id, text and title to stdout; progress
    # messages go to stderr.
    nlp: Language = spacy.load("en_core_web_sm")
    # Maximum number of words per output passage.
    WORD_QTY: int = 180
    files: list[str] = sys.argv[1:]
    out = sys.stdout

    # Adds the header to the file
    out.write("id\ttext\ttitle\n")

    # Every stage below is a lazy generator: nothing is read or parsed until
    # write() starts pulling rows at the end.
    all_lines = itertools.chain()
    for file_name in files:
        # For each book:
        # Reads line by line.
        lines = lines_from_file(file_name)
        # Removes newline at the end of each line.
        lines = strip_newline(lines)
        # Skips page number footer by regex matching.
        lines = skip(r"^Page \|\s*[0-9]+ .*$", lines)
        lines = skip(r"^P a g e.*$", lines)
        lines = skip(r"P.*Rowling", lines)
        # Skips blank lines. The pattern must also match the empty string:
        # after strip_newline a bare newline becomes "", which "\s+" misses.
        lines = skip(r"^\s*$", lines)
        # Fixes pagebreak-straddling paragraphs checking sentence continuity (one paragraph per line up to this point).
        lines = fix_straddling_paragraphs(nlp, lines)
        # Combines contiguous short paragraphs as long as the result doesn't exceed WORD_QTY words.
        lines = combine_up_to_n_words(nlp, WORD_QTY, lines)
        # Splits paragraphs longer than WORD_QTY words, keeping whole sentences.
        lines = split_by_sentences_up_to_n_words(nlp, WORD_QTY, lines)

        # Formats each fragment as TSV appending "<book name> Paragraph <M>" as title,
        # where the book name is the file name as given, minus its extension.
        book_name = re.sub(r"\.[a-zA-Z0-9]+$", "", file_name)
        lines = to_tsv(book_name, lines)

        # Concatenates book lines
        all_lines = itertools.chain(all_lines, lines)

    # Prepends line numbers (will be used as ids)
    all_lines = number_lines(all_lines)
    write(out, all_lines)