-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmass_build_bow.py
More file actions
58 lines (51 loc) · 2.24 KB
/
mass_build_bow.py
File metadata and controls
58 lines (51 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
from pathlib import Path
from vector_store import BoWStore
import time
import fire
from logger_config import get_logger
import json
from scripts.build_bow_vs import load_docs_from_jsonl, to_doc
# Module-level logger, obtained from the project's logger_config helper and
# keyed by this module's name.
logger = get_logger(__name__)
def _load_param_sets(params_file):
    """Return the list of JSON values found in *params_file*, one per line.

    Blank and whitespace-only lines are skipped; ``json.loads`` would raise
    ``json.JSONDecodeError`` on them and abort the whole batch.
    """
    with open(params_file, 'r') as f:
        return [json.loads(line) for line in f if line.strip()]


def build_bow(json_file, params_file, max_features):
    """Build and export one BoW vector store per parameter set.

    Documents are loaded once from ``json_file`` via ``load_docs_from_jsonl``
    and reused for every parameter set read from ``params_file``.

    Args:
        json_file: Path handed to ``load_docs_from_jsonl``; its base name
            (without the last suffix) prefixes every output file name.
        params_file: Path to a text file holding one JSON object per line.
            Recognised keys and their defaults: ``min_n`` (1), ``max_n`` (1),
            ``max_df`` (1.0), ``min_df`` (1). Blank lines are ignored.
        max_features: Forwarded unchanged as the ``max_features`` entry of
            the vectorizer configuration and embedded in the output name.

    Side effects:
        Creates ``./vector_store`` if needed and writes one
        ``<base>_<min_n>_<max_n>_<max_features>_<min_df>_<max_df>_bow.pkl``
        file into it per parameter set.

    Raises:
        json.JSONDecodeError: If a non-blank line of ``params_file`` is not
            valid JSON.
    """
    # exist_ok=True already tolerates an existing directory, so a separate
    # os.path.exists() check is unnecessary (and racy).
    os.makedirs('./vector_store', exist_ok=True)

    logger.info('Loading data...')
    start = time.time()
    data = load_docs_from_jsonl(json_file)
    logger.info('Data loaded in {}s'.format(time.time() - start))

    # Load parameters from the params_file
    all_params = _load_param_sets(params_file)
    logger.info('Total parameters = {}'.format(len(all_params)))

    # The base name does not depend on the parameter set; compute it once.
    base_name = str(Path(os.path.basename(json_file)).with_suffix(''))

    for params in all_params:
        n_min = params.get('min_n', 1)
        n_max = params.get('max_n', 1)
        max_df = params.get('max_df', 1.0)
        min_df = params.get('min_df', 1)
        vectorizer_config = {
            "ngram_range": (n_min, n_max),
            "max_df": max_df,
            "min_df": min_df,
            "stop_words": "english",
            "max_features": max_features,
        }
        logger.info("Vectorizer config: ")
        logger.info(vectorizer_config)

        tf_idf_store = BoWStore(top_k=20, vectorizer_kwargs=vectorizer_config)
        logger.info('Vector store initialized.')

        start = time.time()
        logger.info('Building BOW vector store.')
        tf_idf_store.build_vectorstore(data)
        logger.info('Built vector store.')
        runtime = time.time() - start
        logger.info('Built time: {}s'.format(runtime))

        # Encode the parameter set in the file name so that stores built from
        # different settings do not overwrite each other.
        param_tag = '_'.join(
            str(value)
            for value in (n_min, n_max, max_features, min_df, max_df)
        )
        output_name = base_name + '_' + param_tag + '_bow.pkl'
        tf_idf_store.export(os.path.join('./vector_store', output_name))
        logger.info('Saved vector store: ' + output_name)
# Script entry point: hand build_bow to Fire so that its arguments
# (json_file, params_file, max_features) are supplied on the command line.
if __name__ == "__main__":
    fire.Fire(build_bow)