Skip to content

Commit a39c385

Browse files
authored
Add Amazon Clothing dataset (#197)
1 parent 9cf5dcb commit a39c385

File tree

4 files changed

+161
-16
lines changed

4 files changed

+161
-16
lines changed

cornac/datasets/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
# limitations under the License.
1414
# ============================================================================
1515

16+
from . import amazon_clothing
17+
from . import amazon_office
18+
from . import citeulike
19+
from . import epinions
1620
from . import movielens
17-
from . import tradesy
1821
from . import netflix
19-
from . import amazon_office
20-
from . import citeulike
22+
from . import tradesy

cornac/datasets/amazon_clothing.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright 2018 The Cornac Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ============================================================================
15+
"""
16+
This data is built based on the Amazon datasets provided by Julian McAuley @ http://jmcauley.ucsd.edu/data/amazon/.
17+
We make sure all items have three types of auxiliary data: text, image, and context (items appearing together).
18+
"""
19+
20+
from typing import List
21+
22+
import numpy as np
23+
24+
from ..utils import cache
25+
from ..data import Reader
26+
from ..data.reader import read_text
27+
28+
29+
def load_rating(reader: Reader = None) -> List:
    """Load the user-item ratings of the Amazon Clothing dataset.

    The rating file is resolved through `cache` and then parsed as
    tab-separated text by the given reader.

    Parameters
    ----------
    reader: `obj:cornac.data.Reader`, default: None
        Reader object used to read the data. A default-constructed
        `Reader` is used when None is given.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples (user, item, rating).
    """
    rating_path = cache(
        url='https://static.preferred.ai/cornac/datasets/amazon_clothing/rating.zip',
        unzip=True,
        relative_path='amazon_clothing/rating.txt',
    )
    if reader is None:
        reader = Reader()
    return reader.read(rating_path, sep='\t')
46+
47+
48+
def load_text():
    """Load the item text descriptions of the Amazon Clothing dataset.

    The text file is resolved through `cache` and read with `read_text`
    using '::' as the separator.

    Returns
    -------
    texts: List
        List of text documents, one per item.

    ids: List
        List of item ids aligned with indices in `texts`.
    """
    text_path = cache(
        url='https://static.preferred.ai/cornac/datasets/amazon_clothing/text.zip',
        unzip=True,
        relative_path='amazon_clothing/text.txt',
    )
    documents, doc_ids = read_text(text_path, sep='::')
    return documents, doc_ids
63+
64+
65+
def load_image():
    """Load the item images as visual features (extracted from pre-trained CNN).

    Two cached resources are read: the feature matrix (a `.npy` file loaded
    with `numpy.load`) and the text file listing the item ids.

    Returns
    -------
    features: numpy.ndarray
        Feature matrix with shape (n, 4096), where n is the number of items.

    item_ids: List
        List of item ids aligned with indices in `features`.
    """
    # Keep the original order: the features are fetched and loaded before
    # the id file is requested.
    feature_path = cache(
        url='https://static.preferred.ai/cornac/datasets/amazon_clothing/image.zip',
        unzip=True,
        relative_path='amazon_clothing/image_features.npy',
    )
    features = np.load(feature_path)

    ids_path = cache(
        url='https://static.preferred.ai/cornac/datasets/amazon_clothing/item_ids.zip',
        unzip=True,
        relative_path='amazon_clothing/item_ids.txt',
    )
    item_ids = read_text(ids_path)

    return features, item_ids
81+
82+
83+
def load_context(reader: Reader = None) -> List:
    """Load the item-item interactions of the Amazon Clothing dataset.

    The context file is resolved through `cache` and then parsed by the
    given reader as tab-separated text in the 'UI' format.

    Parameters
    ----------
    reader: `obj:cornac.data.Reader`, default: None
        Reader object used to read the data. A default-constructed
        `Reader` is used when None is given.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples (item, item, 1).
    """
    context_path = cache(
        url='https://static.preferred.ai/cornac/datasets/amazon_clothing/context.zip',
        unzip=True,
        relative_path='amazon_clothing/context.txt',
    )
    if reader is None:
        reader = Reader()
    return reader.read(context_path, fmt='UI', sep='\t')

docs/source/datasets.rst

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,10 @@ Built-in datasets
44

55
.. automodule:: cornac.datasets
66
:members:
7-
8-
MovieLens
9-
-----------------------------------------
10-
.. automodule:: cornac.datasets.movielens
11-
:members:
12-
13-
Netflix
14-
---------------------------------------
15-
.. automodule:: cornac.datasets.netflix
16-
:members:
177

18-
Tradesy
19-
---------------------------------------
20-
.. automodule:: cornac.datasets.tradesy
8+
Amazon Clothing
9+
-----------------------------------------------
10+
.. automodule:: cornac.datasets.amazon_clothing
2111
:members:
2212

2313
Amazon Office
@@ -34,3 +24,19 @@ Epinions
3424
-----------------------------------------
3525
.. automodule:: cornac.datasets.epinions
3626
:members:
27+
28+
MovieLens
29+
-----------------------------------------
30+
.. automodule:: cornac.datasets.movielens
31+
:members:
32+
33+
Netflix
34+
---------------------------------------
35+
.. automodule:: cornac.datasets.netflix
36+
:members:
37+
38+
Tradesy
39+
---------------------------------------
40+
.. automodule:: cornac.datasets.tradesy
41+
:members:
42+
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2018 The Cornac Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ============================================================================
15+
16+
import unittest
17+
import random
18+
import time
19+
20+
from cornac.datasets import amazon_clothing
21+
22+
23+
class TestAmazonClothing(unittest.TestCase):
    """Size checks for the Amazon Clothing dataset loaders."""

    def test_amazon_clothing(self):
        """Check the sizes of the rating, text and context data.

        The loaders are only exercised on a random sample of runs (when the
        drawn number exceeds 0.8). Every other run is reported as skipped
        rather than silently passing without asserting anything.
        """
        # A private generator is seeded on creation, so the sampling stays
        # unpredictable without re-seeding the module-level `random` state
        # that other tests in the same process may depend on.
        if random.Random().random() <= 0.8:
            self.skipTest('amazon_clothing loaders are only checked on a random sample of runs')

        # ignore image because of big size
        ratings = amazon_clothing.load_rating()
        texts, item_ids = amazon_clothing.load_text()
        contexts = amazon_clothing.load_context()

        self.assertEqual(len(ratings), 13689)
        self.assertEqual(len(texts), 3393)
        # `load_text` documents `ids` as aligned with `texts`.
        self.assertEqual(len(item_ids), len(texts))
        self.assertEqual(len(contexts), 9198)
35+
36+
37+
# Script entry point: run this module's tests when the file is executed directly.
if __name__ == '__main__':
38+
unittest.main()

0 commit comments

Comments
 (0)