Skip to content

Commit 51388f5

Browse files
committed
add flickr and blog datasets, add dataset split functions
1 parent f11473f commit 51388f5

File tree

15 files changed

+408
-28
lines changed

15 files changed

+408
-28
lines changed

dhg/data/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,16 @@
77
from .amazon import AmazonBook
88
from .coauthorship import CoauthorshipCora, CoauthorshipDBLP
99
from .cocitation import CocitationCora, CocitationCiteseer, CocitationPubmed
10+
from .blogcatalog import BlogCatalog
11+
from .flickr import Flickr
1012

1113
__all__ = [
1214
"BaseData",
1315
"Cora",
1416
"Citeseer",
1517
"Pubmed",
18+
"BlogCatalog",
19+
"Flickr",
1620
"Cooking200",
1721
"MovieLens1M",
1822
"Yelp2018",

dhg/data/amazon.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class AmazonBook(BaseData):
2121
The first item of each line in the ``adj_list`` is the user id, and the rest is the item id.
2222
2323
Args:
24-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
24+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
2525
"""
2626

2727
def __init__(self, data_root: Optional[str] = None) -> None:

dhg/data/blogcatalog.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class BlogCatalog(BaseData):
    r"""The BlogCatalog dataset is a social network dataset for vertex classification task.
    This is a network of social relationships of bloggers from the BlogCatalog website,
    where nodes' attributes are constructed by keywords, which are generated by users as a short description of their blogs.
    The labels represent the topic categories provided by the authors.

    .. note::
        The L1-normalization for the feature is not recommended for this dataset.

    The content of the BlogCatalog dataset includes the following:

    - ``num_classes``: The number of classes: :math:`6`.
    - ``num_vertices``: The number of vertices: :math:`5,196`.
    - ``num_edges``: The number of edges: :math:`171,743`.
    - ``dim_features``: The dimension of features: :math:`8,189`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(5,196 \times 8,189)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(343,486 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(5,196, )`.

    Args:
        ``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("BlogCatalog", data_root)
        # Each non-scalar entry names the file(s) it is built "upon" (with an
        # md5 checksum), the loader used to read them, and an optional list of
        # preprocessing callables.
        self._content = {
            "num_classes": 6,
            "num_vertices": 5196,
            "num_edges": 171743,
            "dim_features": 8189,
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "ecdd26c63f483c4d919a156f9c8e92fc"}],
                "loader": load_from_pickle,
                # L1 normalization, i.e. partial(norm_ft, ord=1), is deliberately
                # left out of the pipeline: the class docstring notes it is not
                # recommended for this dataset.
                "preprocess": [to_tensor],
            },
            # NOTE(review): the docstring gives the edge list length as 343,486,
            # which is exactly 2 * num_edges. Presumably the pickled list stores
            # both directions of every edge -- confirm against edge_list.pkl.
            "edge_list": {
                "upon": [{"filename": "edge_list.pkl", "md5": "03ffbc8c9a4d9abeab0f127c717888f0"}],
                "loader": load_from_pickle,
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "246e7096dd834a631c33fe0c7afb89b4"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
        }

dhg/data/coauthorship.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class CoauthorshipCora(BaseData):
2424
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
2525
2626
Args:
27-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
27+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
2828
"""
2929
def __init__(self, data_root: Optional[str] = None) -> None:
3030
super().__init__("coauthorship_cora", data_root)

dhg/data/cocitation.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class CocitationCora(BaseData):
2424
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
2525
2626
Args:
27-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
27+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
2828
"""
2929

3030
def __init__(self, data_root: Optional[str] = None) -> None:
@@ -84,7 +84,7 @@ class CocitationCiteseer(BaseData):
8484
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(3,327, )`.
8585
8686
Args:
87-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
87+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
8888
"""
8989

9090
def __init__(self, data_root: Optional[str] = None) -> None:
@@ -144,7 +144,7 @@ class CocitationPubmed(BaseData):
144144
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(19,717, )`.
145145
146146
Args:
147-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
147+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
148148
"""
149149

150150
def __init__(self, data_root: Optional[str] = None) -> None:

dhg/data/cooking_200.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class Cooking200(BaseData):
2626
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(7,403)`.
2727
2828
Args:
29-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
29+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
3030
"""
3131

3232
def __init__(self, data_root: Optional[str] = None) -> None:

dhg/data/flickr.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class Flickr(BaseData):
    r"""The Flickr dataset is a social network dataset for vertex classification task.
    It is a social network where nodes represent users and edges correspond to friendships among users.
    The labels represent the interest groups of the users.

    .. note::
        The L1-normalization for the feature is not recommended for this dataset.

    The content of the Flickr dataset includes the following:

    - ``num_classes``: The number of classes: :math:`9`.
    - ``num_vertices``: The number of vertices: :math:`7,575`.
    - ``num_edges``: The number of edges: :math:`239,738`.
    - ``dim_features``: The dimension of features: :math:`12,047`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(7,575 \times 12,047)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(479,476 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(7,575, )`.

    Args:
        ``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("Flickr", data_root)
        # Each non-scalar entry names the file(s) it is built "upon" (with an
        # md5 checksum), the loader used to read them, and an optional list of
        # preprocessing callables.
        self._content = {
            "num_classes": 9,
            "num_vertices": 7575,
            "num_edges": 239738,
            "dim_features": 12047,
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "8e889c8532a91ddcb29d6a9c377b5528"}],
                "loader": load_from_pickle,
                # L1 normalization, i.e. partial(norm_ft, ord=1), is deliberately
                # left out of the pipeline: the class docstring notes it is not
                # recommended for this dataset.
                "preprocess": [to_tensor],
            },
            # NOTE(review): the docstring gives the edge list length as 479,476,
            # which is exactly 2 * num_edges. Presumably the pickled list stores
            # both directions of every edge -- confirm against edge_list.pkl.
            "edge_list": {
                "upon": [{"filename": "edge_list.pkl", "md5": "ea7412a30539fbc95f76ee3712a07017"}],
                "loader": load_from_pickle,
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "9603c29e31b863a34fc707b606c02880"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
        }

dhg/data/gowalla.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class Gowalla(BaseData):
2222
The first item of each line in the ``adj_list`` is the user id, and the rest is the item id.
2323
2424
Args:
25-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
25+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
2626
"""
2727
def __init__(self, data_root: Optional[str] = None) -> None:
2828
super().__init__("gowalla", data_root)

dhg/data/movielens.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class MovieLens1M(BaseData):
2222
The first item of each line in the ``adj_list`` is the user id, and the rest is the item id.
2323
2424
Args:
25-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
25+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
2626
"""
2727
def __init__(self, data_root: Optional[str] = None) -> None:
2828
super().__init__("movielens_1m", data_root)

dhg/data/planetoid.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class Cora(BaseData):
2323
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
2424
2525
Args:
26-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
26+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
2727
"""
2828
def __init__(self, data_root: Optional[str] = None) -> None:
2929
super().__init__('cora', data_root)
@@ -79,7 +79,7 @@ class Citeseer(BaseData):
7979
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(3,327, )`.
8080
8181
Args:
82-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
82+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
8383
"""
8484
def __init__(self, data_root: Optional[str] = None) -> None:
8585
super().__init__('citeseer', data_root)
@@ -135,7 +135,7 @@ class Pubmed(BaseData):
135135
- ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(19,717, )`.
136136
137137
Args:
138-
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to None.
138+
``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default direction ``~/.dhg/datasets/``. Defaults to ``None``.
139139
"""
140140
def __init__(self, data_root: Optional[str] = None) -> None:
141141
super().__init__('pubmed', data_root)

0 commit comments

Comments
 (0)