Skip to content

Commit 5f8f41e

Browse files
committed
update data_npy with data_txt
1 parent 3e6ddcb commit 5f8f41e

File tree

10 files changed

+820
-6
lines changed

10 files changed

+820
-6
lines changed

datasets/criteo_autofis/Criteo.py

Lines changed: 544 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import numpy as np
16+
import os
17+
from Criteo import Criteo
18+
from tqdm import tqdm
19+
20+
batch_size = 1024

# Parameter dicts handed to Criteo.batch_generator below. The two splits
# differ only in 'gen_type' and 'random_sample'.
train_data_param = {
    'gen_type': 'train',
    'random_sample': True,
    'batch_size': batch_size,
    'split_fields': False,
    'on_disk': True,
    'squeeze_output': True,
}
test_data_param = {
    'gen_type': 'test',
    'random_sample': False,
    'batch_size': batch_size,
    'split_fields': False,
    'on_disk': True,
    'squeeze_output': True,
}

dataset = Criteo(initialized=True)
train_gen = dataset.batch_generator(train_data_param)
test_gen = dataset.batch_generator(test_data_param)

output_dir = 'data/whole_data'


def _export_split(batch_gen, split):
    """Drain ``batch_gen`` and dump its batches as integer text files.

    Every ``(x, y)`` batch yielded by ``batch_gen`` is collected, the
    batches are concatenated along axis 0, and the results are written to
    ``<output_dir>/<split>/<split>_x.txt`` and
    ``<output_dir>/<split>/<split>_y.txt`` with ``fmt='%d'``.

    Args:
        batch_gen: iterable yielding ``(x, y)`` pairs of arrays.
        split: name of the split (``'train'`` or ``'test'``); used for both
            the sub-directory and the file-name prefix.

    Raises:
        ValueError: from ``np.concatenate`` if ``batch_gen`` yields nothing.
    """
    features = []
    labels = []
    for x, y in tqdm(batch_gen):
        features.append(x)
        labels.append(y)

    x = np.concatenate(features, 0)
    y = np.concatenate(labels, 0)
    print(x.shape)

    split_dir = os.path.join(output_dir, split)
    # np.savetxt does not create missing parent directories, so make sure
    # the target directory exists before writing.
    os.makedirs(split_dir, exist_ok=True)
    np.savetxt(os.path.join(split_dir, split + '_x.txt'), x, fmt='%d')
    np.savetxt(os.path.join(split_dir, split + '_y.txt'), y, fmt='%d')


_export_split(train_gen, 'train')
# The test labels were previously written to 'text_y.txt' (typo); they now
# go to 'test_y.txt', matching 'test_x.txt' and the train naming scheme.
_export_split(test_gen, 'test')

models/rank/autofis/config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# global settings
1616

1717
runner:
18-
train_data_dir: "data/sample_data/train"
18+
train_data_dir: "data/sample_data"
1919
train_reader_path: "criteo_reader" # importlib format
2020
use_gpu: False
2121
use_auc: False
@@ -24,7 +24,7 @@ runner:
2424
print_interval: 1
2525
#model_init_path: "output_model/0" # init model
2626
model_save_path: "output_model_autofis"
27-
test_data_dir: "data/sample_data/test"
27+
test_data_dir: "data/sample_data"
2828
infer_reader_path: "criteo_reader" # importlib format
2929
infer_batch_size: 200
3030
infer_load_path: "output_model_autofis"

models/rank/autofis/criteo_reader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ class RecDataset(Dataset):
2222
def __init__(self, file_list, config):
2323
super(RecDataset, self).__init__()
2424
for file in file_list:
25-
if file.endswith('x.npy'):
26-
self.x = np.load(file)
27-
elif file.endswith('y.npy'):
28-
self.y = np.load(file)
25+
if file.endswith('x.txt'):
26+
self.x = np.loadtxt(file, dtype=np.int64)
27+
elif file.endswith('y.txt'):
28+
self.y = np.loadtxt(file, dtype=np.int64)
2929

3030
def __getitem__(self, item):
3131
return self.x[item], self.y[item]

models/rank/autofis/data/sample_data/sample_train_x.txt

Lines changed: 100 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
0
2+
1
3+
1
4+
0
5+
1
6+
1
7+
1
8+
1
9+
0
10+
1
11+
1
12+
1
13+
0
14+
1
15+
0
16+
1
17+
1
18+
0
19+
0
20+
1
21+
0
22+
1
23+
1
24+
0
25+
0
26+
1
27+
1
28+
1
29+
1
30+
1
31+
0
32+
1
33+
0
34+
0
35+
0
36+
0
37+
1
38+
0
39+
0
40+
1
41+
0
42+
1
43+
1
44+
0
45+
0
46+
0
47+
1
48+
0
49+
1
50+
1
51+
1
52+
0
53+
0
54+
0
55+
1
56+
0
57+
0
58+
0
59+
1
60+
1
61+
1
62+
0
63+
1
64+
1
65+
0
66+
0
67+
1
68+
1
69+
0
70+
1
71+
0
72+
0
73+
0
74+
1
75+
1
76+
0
77+
1
78+
0
79+
1
80+
1
81+
1
82+
0
83+
1
84+
0
85+
1
86+
0
87+
0
88+
0
89+
0
90+
1
91+
0
92+
1
93+
0
94+
0
95+
0
96+
1
97+
0
98+
1
99+
0
100+
1
-305 KB
Binary file not shown.
-7.94 KB
Binary file not shown.
-305 KB
Binary file not shown.
-7.94 KB
Binary file not shown.

0 commit comments

Comments
 (0)