Skip to content

Commit d757171

Browse files
committed
shells to run bigdata
1 parent 61fb248 commit d757171

File tree

9 files changed

+493
-7
lines changed

9 files changed

+493
-7
lines changed

datasets/MIND/data/behavior.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Convert MIND behaviour logs into "history<TAB>clicked ids<TAB>non-clicked ids"
# samples, sending roughly one sample in ten to a separate hold-out file.
import random

# Output split names and the behaviour logs they are generated from
# (paired by position).
file_base = ["train", "dev", "test"]
files = ["behaviors.tsv", "behaviors_dev.tsv", "behaviors_test.tsv"]

# NOTE(review): the original script read an undefined `article_map`. The
# sibling make_article.py builds that map from the first column of
# news_backup.tsv, so the same source is assumed here -- confirm.
NEWS_FILE = "news_backup.tsv"

# NOTE(review): the original script wrote about one sample in ten to an
# undefined handle `w2`. The hold-out file name below is a placeholder --
# confirm the intended destination.
HELD_OUT_SUFFIX = "_heldout"


def load_known_ids(news_path):
    """Return the set of news ids found in the first column of ``news_path``."""
    with open(news_path, "r", encoding="utf-8") as news:
        return {row.rstrip("\n").split("\t", 1)[0] for row in news}


def split_impressions(impressions, known_ids):
    """Split an impressions field into clicked and non-clicked id strings.

    Args:
        impressions: whitespace-separated ``<news id>-<0|1>`` items.
        known_ids: container of news ids that may be kept.

    Returns:
        A ``(clicked, not_clicked)`` tuple of space-joined id strings. Items
        without a ``-0``/``-1`` label or with an unknown id are ignored.
    """
    clicked = []
    not_clicked = []
    # split() without arguments also discards the trailing newline of the
    # last column; the original kept it, so the final impression of every
    # row never matched "-0"/"-1" and was silently dropped.
    for item in impressions.split():
        news_id, dash, label = item.rpartition("-")
        if not dash or label not in ("0", "1"):
            continue
        if news_id not in known_ids:
            continue
        (clicked if label == "1" else not_clicked).append(news_id)
    return " ".join(clicked), " ".join(not_clicked)


def build_sample(raw_line, known_ids):
    """Turn one behaviour row into an output line, or ``None`` to skip it.

    A row is skipped when it has fewer than five tab-separated columns, when
    its history column (index 3) is empty, or when it lacks either a clicked
    or a non-clicked known article.
    """
    fields = raw_line.rstrip("\n").split("\t")
    if len(fields) < 5:
        return None
    history = fields[3]
    if not history:
        return None
    clicked, not_clicked = split_impressions(fields[4], known_ids)
    if not clicked or not not_clicked:
        return None
    return history + "\t" + clicked + "\t" + not_clicked + "\n"


def main():
    """Write one sample file plus one hold-out file per behaviour log."""
    known_ids = load_known_ids(NEWS_FILE)
    for behaviors_path, split_name in zip(files, file_base):
        with open(split_name, "w", encoding="utf-8") as main_out, \
                open(split_name + HELD_OUT_SUFFIX, "w",
                     encoding="utf-8") as held_out, \
                open(behaviors_path, "r", encoding="utf-8") as source:
            for raw_line in source:
                sample = build_sample(raw_line, known_ids)
                if sample is None:
                    continue
                # Same draw as the original: a 1-in-10 chance of going to
                # the hold-out file.
                if random.randint(1, 10) == 3:
                    held_out.write(sample)
                else:
                    main_out.write(sample)


if __name__ == "__main__":
    main()

datasets/MIND/data/combine.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Merge the MIND news.tsv splits into one de-duplicated file and build the category / sub-category index maps.
16+
17+
# Press ⌃R to execute it or replace it with your code.
18+
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
19+
20+
21+
def print_hi(name):
    """Print a one-line greeting addressed to ``name``."""
    greeting = f'Hi, {name}'
    print(greeting)
24+
25+
26+
# Press the green button in the gutter to run the script.
27+
def merge_news(news_files, merged_path, text_path):
    """Merge several MIND ``news.tsv`` files and collect their categories.

    Rows are de-duplicated on the first column (the news id); the first
    occurrence wins and is copied verbatim to ``merged_path``. For every kept
    row, column 3 and column 4 (title and content in the original script) are
    written to ``text_path`` on two consecutive lines.

    Args:
        news_files: iterable of input paths, processed in order.
        merged_path: output path for the de-duplicated rows.
        text_path: output path for the two-lines-per-article text file.

    Returns:
        ``(cate_dict, sub_cate_dict)``: dicts mapping each category /
        sub-category name to its index in order of first appearance, with
        ``"<unk>"`` reserved as index 0.

    Raises:
        IndexError: if a kept row has fewer than five tab-separated columns.
    """
    cate_dict = {"<unk>": 0}
    sub_cate_dict = {"<unk>": 0}
    seen_ids = set()
    with open(merged_path, "w", encoding="utf-8") as merged, \
            open(text_path, "w", encoding="utf-8") as text_out:
        for news_file in news_files:
            with open(news_file, "r", encoding="utf-8") as reader:
                for row in reader:
                    # Drop the newline before splitting so that a row whose
                    # last column is the content cannot add a blank line and
                    # break the two-lines-per-article layout.
                    fields = row.rstrip("\n").split("\t")
                    news_id = fields[0]
                    if news_id in seen_ids:
                        continue
                    seen_ids.add(news_id)
                    merged.write(row)
                    cate, sub_cate, title, content = fields[1:5]
                    text_out.write(title + "\n")
                    text_out.write(content + "\n")
                    cate_dict.setdefault(cate, len(cate_dict))
                    sub_cate_dict.setdefault(sub_cate, len(sub_cate_dict))
    return cate_dict, sub_cate_dict


def write_index_map(path, mapping):
    """Write ``mapping`` to ``path`` as ``name<TAB>index`` lines."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            key + "\t" + str(index) + "\n" for key, index in mapping.items())


if __name__ == '__main__':
    news_files = ["train_raw/news.tsv", "dev_raw/news.tsv", "test_raw/news.tsv"]
    cate_dict, sub_cate_dict = merge_news(news_files, "news_backup.tsv",
                                          "kkk/temp.txt")
    write_index_map("cate_map", cate_dict)
    write_index_map("sub_cate_map", sub_cate_dict)
63+
64+
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def count(file):
    """Return the number of lines in the text file at path ``file``.

    Undecodable bytes are replaced rather than raised on, because only the
    line breaks matter for the count.
    """
    with open(file, "r", encoding="utf-8", errors="replace") as reader:
        return sum(1 for _ in reader)
22+
23+
24+
# Record the line counts of the word, category and sub-category dictionaries
# as "key: value" lines in dict/yaml_info.txt.
with open("dict/yaml_info.txt", "w") as w:
    word_count = count("dict/word_dict")
    cate = count("dict/cate_map")
    sub_cate = count("dict/sub_cate_map")
    w.write("word_dict_size: " + str(word_count) + "\n")
    w.write("category_size: " + str(cate) + "\n")
    # This key was written without its colon, unlike the two lines above.
    w.write("sub_category_size: " + str(sub_cate) + "\n")

datasets/MIND/data/lineCount.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def count(file):
    """Return the number of lines in the text file at path ``file``.

    Undecodable bytes are replaced rather than raised on, because only the
    line breaks matter for the count.
    """
    with open(file, "r", encoding="utf-8", errors="replace") as reader:
        return sum(1 for _ in reader)
22+
23+
24+
# Print the line count of the raw text dump and of its converted counterpart,
# one number per line, so the two can be compared by eye.
for text_path in ("kkk/temp.txt", "convert_text8/convert_temp.txt"):
    print(count(text_path))

datasets/MIND/data/make_article.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import random
16+
# Build article.txt from the merged news file and the converted title/content
# text, then turn each behaviour log into "<split>/browse.txt" samples of the
# form "history<TAB>clicked ids<TAB>non-clicked ids".

# Output split directories and the behaviour logs they are generated from
# (paired by position).
file_base = ["train", "dev", "test"]
files = [
    "train_raw/behaviors.tsv", "dev_raw/behaviors.tsv",
    "test_raw/behaviors.tsv"
]


def load_index_map(path):
    """Read ``name<TAB>index`` lines from ``path`` into a dict of strings.

    The newline is removed with ``rstrip`` rather than by cutting the last
    character, so a final line without a newline keeps its full value.
    """
    mapping = {}
    with open(path, "r", encoding="utf-8") as reader:
        for row in reader:
            fields = row.rstrip("\n").split("\t")
            mapping[fields[0]] = fields[1]
    return mapping


def load_titles_and_contents(path):
    """Return ``(titles, contents)`` read from alternating lines of ``path``.

    Even-numbered lines (0-based) are titles, odd-numbered lines contents.
    """
    with open(path, "r", encoding="utf-8") as reader:
        rows = [row.rstrip("\n") for row in reader]
    return rows[0::2], rows[1::2]


def write_articles(news_path, article_path, cate_dict, sub_cate_dict, titles,
                   contents):
    """Write ``id, category index, sub-category index, title, content`` rows.

    The n-th row of ``news_path`` is paired with ``titles[n]`` and
    ``contents[n]``.

    Returns:
        ``(rows_written, article_map)`` where ``article_map`` maps every news
        id seen to an integer index.

    Raises:
        KeyError: if a category or sub-category is missing from its map.
        IndexError: if there are fewer titles/contents than news rows.
    """
    article_map = {}
    rows_written = 0
    with open(article_path, "w", encoding="utf-8") as articles, \
            open(news_path, "r", encoding="utf-8") as news:
        for row in news:
            news_id, cate, sub_cate = row.split("\t")[:3]
            article_map[news_id] = len(article_map)
            columns = (news_id, cate_dict[cate], sub_cate_dict[sub_cate],
                       titles[rows_written], contents[rows_written])
            articles.write("\t".join(columns) + "\n")
            rows_written += 1
    return rows_written, article_map


def split_impressions(impressions, known_ids):
    """Split an impressions field into clicked and non-clicked id strings.

    Args:
        impressions: whitespace-separated ``<news id>-<0|1>`` items.
        known_ids: container of news ids that may be kept.

    Returns:
        A ``(clicked, not_clicked)`` tuple of space-joined id strings. Items
        without a ``-0``/``-1`` label or with an unknown id are ignored.
    """
    clicked = []
    not_clicked = []
    # split() without arguments also discards the trailing newline of the
    # last column; the original kept it, so the final impression of every
    # row never matched "-0"/"-1" and was silently dropped.
    for item in impressions.split():
        news_id, dash, label = item.rpartition("-")
        if not dash or label not in ("0", "1"):
            continue
        if news_id not in known_ids:
            continue
        (clicked if label == "1" else not_clicked).append(news_id)
    return " ".join(clicked), " ".join(not_clicked)


def build_sample(raw_line, known_ids):
    """Turn one behaviour row into an output line, or ``None`` to skip it.

    A row is skipped when it has fewer than five tab-separated columns, when
    its history column (index 3) is empty, or when it lacks either a clicked
    or a non-clicked known article.
    """
    fields = raw_line.rstrip("\n").split("\t")
    if len(fields) < 5:
        return None
    history = fields[3]
    if not history:
        return None
    clicked, not_clicked = split_impressions(fields[4], known_ids)
    if not clicked or not not_clicked:
        return None
    return history + "\t" + clicked + "\t" + not_clicked + "\n"


def write_samples(lines, sink, known_ids):
    """Write the sample for every usable behaviour row in ``lines`` to ``sink``."""
    for raw_line in lines:
        sample = build_sample(raw_line, known_ids)
        if sample is not None:
            sink.write(sample)


def main():
    """Run the whole conversion, printing the same progress values as before."""
    cate_dict = load_index_map("cate_map")
    sub_cate_dict = load_index_map("sub_cate_map")
    print(cate_dict)
    print(sub_cate_dict)

    title_r, content_r = load_titles_and_contents(
        "convert_text8/convert_temp.txt")
    # Total number of text lines read (titles plus contents).
    print(len(title_r) + len(content_r))

    inx, article_map = write_articles("news_backup.tsv", "article.txt",
                                      cate_dict, sub_cate_dict, title_r,
                                      content_r)
    print(inx)
    print(len(article_map))

    for behaviors_path, split_name in zip(files, file_base):
        with open(split_name + "/browse.txt", "w", encoding="utf-8") as sink:
            print("generate " + split_name)
            with open(behaviors_path, "r", encoding="utf-8") as source:
                write_samples(source, sink, article_map)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)