-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path example_to_schema.py
More file actions
66 lines (50 loc) · 2.35 KB
/
example_to_schema.py
File metadata and controls
66 lines (50 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# version: 26.04.2022
# author: Hannah Devinney
# EXAMPLE code for transforming a corpus (in this case, nltk's copy of the Brown Corpus) into the .json files
# compatible with the rest of EQUITBL and saving them in the corpora folder.
# Should be quite adaptable to other corpora.
import io
import os

import pandas as pd

#the relevant bits from this package are in tools/corpus/corpus_mapping.py
from tools.corpus.corpus_mapping import map_to_schema
def main():
    """Convert nltk's Brown Corpus into EQUITBL-schema .json files.

    Walks the Brown Corpus category by category, flattens each file's
    tokenized sentences into a single text string, saves the intermediate
    (file_id, category, sentences) dataframe, then maps it onto the
    EQUITBL schema with map_to_schema and saves the result.

    Writes:
        test_files/input/corpora/brown_categories.json (intermediate)
        test_files/input/corpora/brown_corpus.json (EQUITBL schema)
    """
    out_file_path = 'test_files/input/corpora/'
    #to_json raises OSError if the target directory is missing, so create it up front
    os.makedirs(out_file_path, exist_ok=True)
    #load the brown corpus (imported lazily so nltk is only needed when this runs)
    from nltk.corpus import brown
    #the brown corpus is organized by categories
    cats = brown.categories()
    #put corpus into a dataframe
    #for illustrative purposes, we will initially include some information
    #which we will later get rid of when mapping to the EQUITBL schema
    #but because concat'ing dataframes is terrible, we'll actually build it as a list of dictionaries
    data = []
    #go through category by category:
    for cat in cats:
        f_ids = brown.fileids(cat)
        #go through the category file by file
        for f_id in f_ids:
            #get the text as a list of lists of tokens
            sent_lists = brown.sents(f_id)
            #we just want the text so join everything into a single string
            sentences = " ".join(token for sentence in sent_lists for token in sentence)
            data.append({'file_id': f_id, 'category': cat, 'sentences': sentences})
    #once you've gone through all of it, convert data into the dataframe:
    full_df = pd.DataFrame(data)
    #save it somewhere
    full_save = os.path.join(out_file_path, 'brown_categories.json')
    full_df.to_json(full_save)
    #so now we have a dataframe with the columns 'file_id', 'category' and 'sentences'
    #(and we will end up ignoring the 'category' field,
    #but e.g. if you only wanted to convert a subset of the corpus it would be useful!)
    #we have to tell map_to_schema what IDs and TEXTs are currently called:
    id_name = 'file_id'
    text_name = 'sentences'  #if you have multiple TEXT columns, list all of them to combine as a single 'article'
    #map to the EQUITBL schema
    new_df = map_to_schema(original_file=full_save, text_path=text_name, id_path=id_name)
    #save!
    save_as = os.path.join(out_file_path, 'brown_corpus.json')
    new_df.to_json(save_as)
# Run the conversion only when executed as a script (not when imported).
if __name__ == "__main__":
    main()