EQUITBL/example_preprocess.py at main · hdevinney/EQUITBL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python3
# -*- coding: utf-8 -*-

# version: 26.04.2022
# author: Hannah Devinney
# Preprocess the Brown corpus into bags of lemmaPOSes (the text is chunked into documents using a sliding window), then save the result as a gensim dictionary and corpus.
# Note: some preprocessing settings MUST be provided in the config file!


import io
import pickle
import re
import pandas as pd
from configparser import ConfigParser
from argparse import ArgumentParser

#preprocessing (for English) is in equitbl/tools/corpus/preprocessing_eng
from tools.corpus import preprocessing_eng as preproc

from gensim import corpora
import gensim

def _parse_config_list(raw):
    """Parse a bracketed, comma-separated config value into a list of strings.

    All spaces are removed, surrounding square brackets are stripped, and the
    remainder is split on commas. Empty entries are dropped, so a value such
    as ``[]`` yields an empty list rather than ``['']``.
    """
    items = raw.replace(" ", "").strip('[]').split(',')
    return [item for item in items if item]


def main():
    """Build and save a gensim dictionary and bag-of-words corpus.

    The path to an .ini configuration file is taken from the required
    ``-config`` command-line argument. The following keys are read:

    * ``[FILE_PATHS]``: ``project_root``, ``seed_file``, ``corpus_name``,
      ``dictionary_name``
    * ``[PREPROCESSING]``: ``chunk_size``, ``minimum_freq``, ``ignore_pos``,
      ``stopwords``

    Input is read from ``<project_root>/input/`` and the results are written
    to ``<project_root>/models/bow/`` as ``<dictionary_name>.gensim`` (the
    dictionary) and ``<dictionary_name>.pkl`` (the pickled corpus).

    Exits through ``ArgumentParser.error`` if the config file cannot be read.
    """
    import os  # local import: only this function needs it (path handling)

    ##################### ARGUMENTS #######################
    a = ArgumentParser()
    a.add_argument('-config', dest='config_file', required=True, type=str, help="path to .ini configuration file (for specifying file paths)")
    opts = a.parse_args()
    config = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse; fail early with a clear message instead of
    # an opaque KeyError on the first section lookup.
    if not config.read(opts.config_file):
        a.error("could not read configuration file: {}".format(opts.config_file))

    #paths
    # os.path.join only inserts a separator when project_root lacks one, so
    # roots written with or without a trailing slash both work.
    base_dir = config['FILE_PATHS']['project_root']
    input_dir = os.path.join(base_dir, 'input/corpora/')
    output_dir = os.path.join(base_dir, 'models/bow/')

    #files
    seedfile = os.path.join(base_dir, 'input/seeds/') + config['FILE_PATHS']['seed_file']
    input_corpus = input_dir + config['FILE_PATHS']['corpus_name'] + '.json'
    output_name = config['FILE_PATHS']['dictionary_name']

    #preprocessing choices
    CHUNK_SIZE = config['PREPROCESSING'].getint('chunk_size')
    MINIMUM = config['PREPROCESSING'].getint('minimum_freq')
    IGNORE = _parse_config_list(config['PREPROCESSING']['ignore_pos'])
    STOPWORDS = _parse_config_list(config['PREPROCESSING']['stopwords'])

    ########################################################

    #get seed words (will be exempted from pruning)
    with open(seedfile, 'r') as inputfile:
        seed_list = [seed.rstrip() for seed in inputfile] #clean up whitespace

    #process the input documents (get TOKENS (tagged and lemmatized))
    print("processing: {}".format(input_corpus))
    # read inside a context manager so the corpus file handle is closed
    with open(input_corpus) as corpus_file:
        df = pd.read_json(corpus_file)
    lemma_dictionary = preproc.get_chunked_pos_lemmas_dictionary(df, IGNORE, STOPWORDS, CHUNK_SIZE)

    #prune vocabulary
    print("PRUNING INFREQUENT NON-SEED TERMS")
    init_docs, frequency = preproc.get_docs_and_frequencies(lemma_dictionary)
    documents = preproc.prune_dict(init_docs, frequency, seed_list, MINIMUM)
    print("completed")

    #make sure the output location exists before anything is written to it
    os.makedirs(output_dir, exist_ok=True)

    #convert to gensim dictionary; save
    dictionary = corpora.Dictionary(documents)
    dictionary.save(output_dir + output_name + ".gensim")
    print("Constructed dictionary")

    #process the documents; save info as a gensim corpus
    corpus = [dictionary.doc2bow(text) for text in documents]
    print("Constructed corpus BoW: " + str(len(corpus)))
    # the context manager guarantees the pickle is flushed and closed
    with open(output_dir + output_name + ".pkl", 'wb') as corpus_out:
        pickle.dump(corpus, corpus_out, protocol=2)

    print("Saved dictionary and corpus in " + output_dir)

    #you may also want a .mm format for the corpus (for back up etc.)
    #if so, uncomment the following:
    # corpora.MmCorpus.serialize(output_dir + output_name + ".mm", corpus)
    # print("saved gensim version of corpus to " + output_dir + output_name + ".mm")


# Script entry point: run the preprocessing pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()