-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_stats.py
More file actions
97 lines (68 loc) · 2.72 KB
/
data_stats.py
File metadata and controls
97 lines (68 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from collections import Counter
import re
from coptic_utils import logger
# Paths of the individual data files this module reports statistics on.
FULL_DATA = "./data/full_data.csv"
RECONSTRUCTED_DATA = "./data/masked_test_reconstructed_lacuna.csv"
EMPTY_LACUNA_DATA = "./data/test_empty_lacuna.csv"

# Every file that is processed, in order, when the module runs as a script.
FILES = [
    FULL_DATA,
    RECONSTRUCTED_DATA,
    EMPTY_LACUNA_DATA,
]
def char_histogram(file_name):
    """Count how often each character occurs in a text file.

    The whole file is read, leading and trailing whitespace is stripped, and
    every remaining character (including interior newlines and separators)
    is counted.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A ``collections.Counter`` mapping each character to the number of
        times it occurs.
    """
    # Decode explicitly as UTF-8 so the counts do not depend on the
    # platform's locale encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        file_text = f.read().strip()
    return Counter(file_text)
def char_counts(file_path):
    """Log sentence and character statistics for a newline-delimited file.

    The file content is stripped of surrounding whitespace and split on
    newlines; every resulting line is treated as one sentence. The number of
    sentences, the total number of characters (newlines excluded), the
    shortest and longest sentence, and the average sentence length are
    reported through ``logger.info``.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        None. The statistics are only logged.
    """
    logger.info(f"Gathering character stats for {file_path}")
    # Decode explicitly as UTF-8 so the statistics do not depend on the
    # platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.read().strip().split("\n")
    logger.info(f"Number of sentences: {len(sentences)}")

    total_char = sum(len(sentence) for sentence in sentences)
    logger.info(f"Number of characters: {total_char}")

    # ``split`` always yields at least one element (an empty file gives
    # [""]), so min/max and the division below cannot fail on empty input.
    shortest = min(sentences, key=len)
    # Scan in reverse so that, among equally long sentences, the one that
    # appears last in the file is reported (same choice a stable sort by
    # length followed by taking the final element would make).
    longest = max(reversed(sentences), key=len)
    logger.info(f"Shortest sentence: {len(shortest)}, {shortest}")
    logger.info(f"Longest sentence: {len(longest)}, {longest}")

    ave_sentence_len = total_char / len(sentences)
    logger.info(f"Average sentence length: {round(ave_sentence_len, 2)}")
def gap_counts(file_path):
    """Log statistics about ``#`` gap characters in a newline-delimited file.

    The file content is stripped of surrounding whitespace and split on
    newlines; every resulting line is treated as one sentence. A gap is a
    maximal run of consecutive ``#`` characters inside a sentence. The
    following are reported through ``logger.info``:

    * the total number of ``#`` characters,
    * how many sentences contain 0, 1, 2, ... gaps,
    * how many gaps there are of each length,
    * when at least one gap exists, the total number of gaps and the
      average gap length.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        None. The statistics are only logged.
    """
    logger.info(f"Gathering gap stats for {file_path}")
    # Decode explicitly as UTF-8 so the statistics do not depend on the
    # platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.read().strip().split("\n")

    gap_char_count = sum(sentence.count("#") for sentence in sentences)
    logger.info(f"Gap characters: {gap_char_count}")

    # One list of gap strings per sentence; empty when the sentence has none.
    gaps_by_sentence = [re.findall(r"#+", sentence) for sentence in sentences]

    # Number of sentences keyed by how many gaps they contain, with keys in
    # ascending order so the logged dict reads from fewest to most gaps.
    masks_per_sentence = dict(
        sorted(Counter(len(gaps) for gaps in gaps_by_sentence).items())
    )
    logger.info(f"Masks per sentence: {masks_per_sentence}")

    # Number of gaps keyed by gap length, again in ascending key order.
    all_gaps = [gap for gaps in gaps_by_sentence for gap in gaps]
    sorted_length_per_gap = dict(
        sorted(Counter(len(gap) for gap in all_gaps).items())
    )
    logger.info(f"Length per gap: {sorted_length_per_gap}")

    gap_count = len(all_gaps)
    # Skip the summary when there are no gaps to avoid dividing by zero.
    if gap_count > 0:
        logger.info(
            f"Total gap characters: {gap_char_count}, total gaps: {gap_count}, "
            f"average length per gap {round(gap_char_count/gap_count, 2)}"
        )
if __name__ == "__main__":
    # For each configured data file: print its character histogram, then log
    # its sentence/character statistics and its gap statistics.
    for data_path in FILES:
        print(char_histogram(data_path))
        char_counts(data_path)
        gap_counts(data_path)