-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutilities.py
More file actions
53 lines (47 loc) · 1.87 KB
/
utilities.py
File metadata and controls
53 lines (47 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import ast
import pandas as pd
import gc
import os
def convert_byte_string(string):
    """Decode the textual repr of a bytes literal (e.g. "b'abc'") to str.

    The input is a string such as ``"b'hello'"`` as produced by ``repr()``;
    it is evaluated back into a ``bytes`` object and decoded as UTF-8.
    """
    raw_bytes = ast.literal_eval(string)
    return raw_bytes.decode("utf-8")
def read_results(filename):
    """Parse a results dump into two score lists and the decoded sample texts.

    The file is expected to contain ``first_score=<float>,`` and
    ``second_score=<float>,`` markers anywhere in the text, plus blocks
    separated by four consecutive newlines; a block that starts with "(" holds
    one repr'd bytes literal (e.g. ``b'...'``) per line, each of which is
    decoded as UTF-8 and concatenated into one sample text.

    Args:
        filename: path to the results dump.

    Returns:
        Tuple ``(first_scores, second_scores, sample_texts)`` — two lists of
        floats and a list of decoded strings.

    Raises:
        AssertionError: if the two score counts are neither equal nor in a
            4:3 ratio (a marker "collision" in the dump).
    """
    # Explicit encoding: the payload is decoded as UTF-8 below, so the file
    # itself should not be read with a locale-dependent default codec.
    with open(filename, encoding="utf-8") as results_file:
        results = results_file.read()

    # Everything between a marker and the next comma is the float value.
    first_scores = [
        float(chunk.split(",")[0]) for chunk in results.split("first_score=")[1:]
    ]
    second_scores = [
        float(chunk.split(",")[0]) for chunk in results.split("second_score=")[1:]
    ]

    sample_texts = []
    for text in results.split("\n\n\n\n")[1:]:
        if not text.startswith("("):
            continue
        samples = []
        # Strip the surrounding parentheses, then decode each bytes-literal
        # line; an empty line marks the end of the sample block.
        for line in text[1:-1].split("\n"):
            if not line:
                break
            samples.append(ast.literal_eval(line).decode("utf-8"))
        sample_texts.append("".join(samples))

    # Sanity check: marker counts must be consistent (equal, or 4:3);
    # anything else means a stray marker collided with the expected format.
    assert (
        len(first_scores) * 3 == len(second_scores) * 4
        or len(first_scores) == len(second_scores)
    ), "It seems that a collision appeared and this function cannot be used properly."
    return first_scores, second_scores, sample_texts
def search_in_train(dataset_prefix, sample_texts, code_field):
    """Collect rows from numbered dataset files whose ``code_field`` value
    contains any of the given sample texts as a substring.

    Dataset parts are looked up in the current working directory as
    ``<dataset_prefix>1``, ``<dataset_prefix>2``, ... until the first missing
    number; each part is read with "\x01" as the separator and "\x02" as the
    escape character.

    Returns:
        A single DataFrame concatenating the matching rows of every part.
    """
    # De-duplicate the search strings once, up front.
    needles = pd.Series(sample_texts).astype(str).drop_duplicates()

    matched_frames = []
    part_number = 1
    while True:
        part_path = os.path.join(os.getcwd(), dataset_prefix + str(part_number))
        if not os.path.isfile(part_path):
            break
        frame = pd.read_table(part_path, sep="\1", escapechar="\2")
        mask = frame[code_field].apply(
            lambda cell: any(needle in str(cell) for needle in needles)
        )
        matched_frames.append(frame[mask].copy())
        part_number += 1
    gc.collect()
    return pd.concat(matched_frames)