Skip to content

Commit 8b115bd

Browse files
committed
add three bipartite graph datasets
1 parent 9387412 commit 8b115bd

File tree

11 files changed

+508
-192
lines changed

11 files changed

+508
-192
lines changed

dhg/data/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from .base import BaseData
2-
from .planetoid import Cora, Citeseer, Pubmed
32
from .cooking_200 import Cooking200
43
from .movielens import MovieLens1M
54
from .yelp import Yelp2018
@@ -11,6 +10,10 @@
1110
from .flickr import Flickr
1211
from .github import Github
1312
from .facebook import Facebook
13+
from .tencent import TencentBiGraph
14+
from .cora import Cora, CoraBiGraph
15+
from .citeseer import Citeseer, CiteseerBiGraph
16+
from .pubmed import Pubmed, PubmedBiGraph
1417

1518
__all__ = [
1619
"BaseData",
@@ -26,6 +29,10 @@
2629
"Yelp2018",
2730
"Gowalla",
2831
"AmazonBook",
32+
"TencentBiGraph",
33+
"CoraBiGraph",
34+
"CiteseerBiGraph",
35+
"PubmedBiGraph",
2936
"CoauthorshipCora",
3037
"CoauthorshipDBLP",
3138
"CocitationCora",

dhg/data/citeseer.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class Citeseer(BaseData):
    r"""Citeseer: a citation network dataset for the vertex classification task.
    More details can be found in this `website <https://relational.fit.cvut.cz/dataset/CiteSeer>`_.

    The content of the Citeseer dataset includes the following:

    - ``num_classes``: The number of classes: :math:`6`.
    - ``num_vertices``: The number of vertices: :math:`3,327`.
    - ``num_edges``: The number of edges: :math:`9,464`.
    - ``dim_features``: The dimension of features: :math:`3,703`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(3,327 \times 3,703)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(9,464 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(3,327, )`.
    - ``train_mask``: The train mask. ``torch.BoolTensor`` with size :math:`(3,327, )`.
    - ``val_mask``: The validation mask. ``torch.BoolTensor`` with size :math:`(3,327, )`.
    - ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(3,327, )`.

    Args:
        ``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("citeseer", data_root)

        def pickled(filename, md5, *steps):
            # Descriptor for one pickle-backed item: the source file with its
            # md5, the loader, and (only when steps are given) the
            # ``preprocess`` list. Items without steps carry no such key.
            item = {
                "upon": [{"filename": filename, "md5": md5}],
                "loader": load_from_pickle,
            }
            if steps:
                item["preprocess"] = list(steps)
            return item

        # Scalar statistics first, then the file-backed items, in the same
        # key order as the docstring above.
        content = {
            "num_classes": 6,
            "num_vertices": 3327,
            "num_edges": 9464,
            "dim_features": 3703,
        }
        content["features"] = pickled(
            "features.pkl", "7458c683e584a0c5ce1ab7af763777c6", to_tensor, partial(norm_ft, ord=1)
        )
        content["edge_list"] = pickled("edge_list.pkl", "1948e9f712bc16ba8ef48a3e79fc2246")
        content["labels"] = pickled("labels.pkl", "f5bcf7815e463af4f88d40195f0d378c", to_long_tensor)
        content["train_mask"] = pickled("train_mask.pkl", "9aae62b41403b976c4cc048685c966e6", to_bool_tensor)
        content["val_mask"] = pickled("val_mask.pkl", "4527d7dc1e2604cdaa9e18916f32714b", to_bool_tensor)
        content["test_mask"] = pickled("test_mask.pkl", "af49e6f6f53c73b7d3a62d6f9b2a3871", to_bool_tensor)
        self._content = content
65+
66+
67+
class CiteseerBiGraph(BaseData):
    r"""CiteseerBiGraph: a citation network dataset for the vertex classification task.
    It is a synthetic bipartite graph dataset generated from a citation network (single graph),
    where documents and the citation links between them are treated as nodes and undirected edges, respectively.
    More details see the `Cascade-BGNN: Toward Efficient Self-supervised Representation Learning on Large-scale Bipartite Graphs <https://arxiv.org/pdf/1906.11994.pdf>`_ paper.

    The content of the CiteseerBiGraph dataset includes the following:

    - ``num_u_classes``: The number of classes in set :math:`U` : :math:`6`.
    - ``num_u_vertices``: The number of vertices in set :math:`U` : :math:`1,237`.
    - ``num_v_vertices``: The number of vertices in set :math:`V` : :math:`742`.
    - ``num_edges``: The number of edges: :math:`1,665`.
    - ``dim_u_features``: The dimension of features in set :math:`U` : :math:`3,703`.
    - ``dim_v_features``: The dimension of features in set :math:`V` : :math:`3,703`.
    - ``u_features``: The vertex feature matrix in set :math:`U`. ``torch.Tensor`` with size :math:`(1,237 \times 3,703)`.
    - ``v_features``: The vertex feature matrix in set :math:`V` . ``torch.Tensor`` with size :math:`(742 \times 3,703)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(1,665 \times 2)`.
    - ``u_labels``: The label list in set :math:`U` . ``torch.LongTensor`` with size :math:`(1,237, )`.

    Args:
        ``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("citeseer_bigraph", data_root)

        def from_pickle(filename, md5, preprocess=None):
            # Descriptor for one pickle-backed item. The ``preprocess`` key is
            # added only when a step list is supplied, so items without
            # preprocessing do not carry it at all.
            descriptor = {
                "upon": [{"filename": filename, "md5": md5}],
                "loader": load_from_pickle,
            }
            if preprocess is not None:
                descriptor["preprocess"] = preprocess
            return descriptor

        self._content = {
            "num_u_classes": 6,
            "num_u_vertices": 1237,
            "num_v_vertices": 742,
            "num_edges": 1665,
            "dim_u_features": 3703,
            "dim_v_features": 3703,
            # Both feature matrices go through the same two steps; each gets
            # its own step list and its own ``partial`` instance.
            "u_features": from_pickle(
                "u_features.pkl",
                "d8c1ccd6026cbb1f05cc3c534b239e00",
                [to_tensor, partial(norm_ft, ord=1)],
            ),
            "v_features": from_pickle(
                "v_features.pkl",
                "7ca1d16ad557945f9b66ef6ac40c0210",
                [to_tensor, partial(norm_ft, ord=1)],
            ),
            "edge_list": from_pickle("edge_list.pkl", "2a632085fb8f691af6399fbb71dc1f67"),
            "u_labels": from_pickle(
                "u_labels.pkl",
                "b4d0034c29f6f5b6da17f3037c2af605",
                [to_long_tensor],
            ),
        }

dhg/data/cora.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class Cora(BaseData):
    r"""Cora: a citation network dataset for the vertex classification task.
    More details can be found in this `website <https://relational.fit.cvut.cz/dataset/CORA>`_.

    The content of the Cora dataset includes the following:

    - ``num_classes``: The number of classes: :math:`7`.
    - ``num_vertices``: The number of vertices: :math:`2,708`.
    - ``num_edges``: The number of edges: :math:`10,858`.
    - ``dim_features``: The dimension of features: :math:`1,433`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(2,708 \times 1,433)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(10,858 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(2,708, )`.
    - ``train_mask``: The train mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
    - ``val_mask``: The validation mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
    - ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.

    Args:
        ``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("cora", data_root)

        def pickled(filename, md5, *steps):
            # Descriptor for one pickle-backed item: the source file with its
            # md5, the loader, and (only when steps are given) the
            # ``preprocess`` list. Items without steps carry no such key.
            item = {
                "upon": [{"filename": filename, "md5": md5}],
                "loader": load_from_pickle,
            }
            if steps:
                item["preprocess"] = list(steps)
            return item

        # Scalar statistics first, then the file-backed items, in the same
        # key order as the docstring above.
        content = {
            "num_classes": 7,
            "num_vertices": 2708,
            "num_edges": 10858,
            "dim_features": 1433,
        }
        content["features"] = pickled(
            "features.pkl", "05b45e9c38cc95f4fc44b3668cc9ddc9", to_tensor, partial(norm_ft, ord=1)
        )
        content["edge_list"] = pickled("edge_list.pkl", "f488389c1edd0d898ce273fbd27822b3")
        content["labels"] = pickled("labels.pkl", "e506014762052c6a36cb583c28bdae1d", to_long_tensor)
        content["train_mask"] = pickled("train_mask.pkl", "a11357a40e1f0b5cce728d1a961b8e13", to_bool_tensor)
        content["val_mask"] = pickled("val_mask.pkl", "355544da566452601bcfa74d30539a71", to_bool_tensor)
        content["test_mask"] = pickled("test_mask.pkl", "bbfc87d661560f55f6946f8cb9d602b9", to_bool_tensor)
        self._content = content
67+
68+
69+
class CoraBiGraph(BaseData):
    r"""CoraBiGraph: a citation network dataset for the vertex classification task.
    It is a synthetic bipartite graph dataset generated from a citation network (single graph),
    where documents and the citation links between them are treated as nodes and undirected edges, respectively.
    More details see the `Cascade-BGNN: Toward Efficient Self-supervised Representation Learning on Large-scale Bipartite Graphs <https://arxiv.org/pdf/1906.11994.pdf>`_ paper.

    The content of the CoraBiGraph dataset includes the following:

    - ``num_u_classes``: The number of classes in set :math:`U` : :math:`7`.
    - ``num_u_vertices``: The number of vertices in set :math:`U` : :math:`1,312`.
    - ``num_v_vertices``: The number of vertices in set :math:`V` : :math:`789`.
    - ``num_edges``: The number of edges: :math:`2,314`.
    - ``dim_u_features``: The dimension of features in set :math:`U` : :math:`1,433`.
    - ``dim_v_features``: The dimension of features in set :math:`V` : :math:`1,433`.
    - ``u_features``: The vertex feature matrix in set :math:`U`. ``torch.Tensor`` with size :math:`(1,312 \times 1,433)`.
    - ``v_features``: The vertex feature matrix in set :math:`V` . ``torch.Tensor`` with size :math:`(789 \times 1,433)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(2,314 \times 2)`.
    - ``u_labels``: The label list in set :math:`U` . ``torch.LongTensor`` with size :math:`(1,312, )`.

    Args:
        ``data_root`` (``str``, optional): The ``data_root`` has stored the data. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("cora_bigraph", data_root)

        def from_pickle(filename, md5, preprocess=None):
            # Descriptor for one pickle-backed item. The ``preprocess`` key is
            # added only when a step list is supplied, so items without
            # preprocessing do not carry it at all.
            descriptor = {
                "upon": [{"filename": filename, "md5": md5}],
                "loader": load_from_pickle,
            }
            if preprocess is not None:
                descriptor["preprocess"] = preprocess
            return descriptor

        self._content = {
            "num_u_classes": 7,
            "num_u_vertices": 1312,
            "num_v_vertices": 789,
            "num_edges": 2314,
            "dim_u_features": 1433,
            "dim_v_features": 1433,
            # Both feature matrices go through the same two steps; each gets
            # its own step list and its own ``partial`` instance.
            "u_features": from_pickle(
                "u_features.pkl",
                "84f0ecee4233ca70d40d36f457470032",
                [to_tensor, partial(norm_ft, ord=1)],
            ),
            "v_features": from_pickle(
                "v_features.pkl",
                "de65cd478ea05333c26184bc8b2cb468",
                [to_tensor, partial(norm_ft, ord=1)],
            ),
            "edge_list": from_pickle("edge_list.pkl", "e7b82c4a8305e1488beac1b788ad46e3"),
            "u_labels": from_pickle(
                "u_labels.pkl",
                "65dff86f7920cdab61790d48a39f2e5b",
                [to_long_tensor],
            ),
        }
121+

0 commit comments

Comments
 (0)