Skip to content

Commit afd49f9

Browse files
committed
add DBLP-4K dataset
1 parent 27f944f commit afd49f9

File tree

3 files changed

+76
-2
lines changed

3 files changed

+76
-2
lines changed

dhg/data/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .white_house import HouseCommittees
99
from .news import News20
1010
from .coauthorship import CoauthorshipCora, CoauthorshipDBLP
11-
from .dblp import DBLP8k
11+
from .dblp import DBLP4k, DBLP8k
1212
from .cocitation import CocitationCora, CocitationCiteseer, CocitationPubmed
1313
from .blogcatalog import BlogCatalog
1414
from .flickr import Flickr
@@ -39,6 +39,7 @@
3939
"PubmedBiGraph",
4040
"CoauthorshipCora",
4141
"CoauthorshipDBLP",
42+
"DBLP4k",
4243
"DBLP8k",
4344
"CocitationCora",
4445
"CocitationCiteseer",

dhg/data/dblp.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Optional
2+
from functools import partial
23

3-
from dhg.datapipe import load_from_pickle
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_bool_tensor, to_tensor, to_long_tensor
45

56
from .base import BaseData
67

@@ -29,3 +30,74 @@ def __init__(self, data_root: Optional[str] = None) -> None:
2930
},
3031
}
3132

33+
34+
class DBLP4k(BaseData):
35+
r"""The DBLP-4k dataset is a citation network hypergraph dataset for node classification task.
36+
The dataset is an academic network from four research areas. There are 14,475 authors,
37+
14,376 papers, and 20 conferences, among which 4,057 authors, 20 conferences and
38+
100 papers are labeled with one of the four research areas (database, data mining, machine learning, and information retrieval).
39+
Each vertex denotes an author, and three types of correlation (co-paper, co-term, co-conference) can be used to build hyperedges.
40+
For more details, see the `PathSim: Meta Path-Based Top-K Similarity Search in Heterogeneous Information Networks <http://www.vldb.org/pvldb/vol4/p992-sun.pdf>`_ paper.
41+
42+
The content of the DBLP-4k dataset includes the following:
43+
44+
- ``num_classes``: The number of classes: :math:`4`.
45+
- ``num_vertices``: The number of vertices: :math:`4,057`.
46+
- ``num_paper_edges``: The number of hyperedges constructed by the co-paper correlation: :math:`14,328`.
47+
- ``num_term_edges``: The number of hyperedges constructed by the co-term correlation: :math:`7,723`.
48+
- ``num_conf_edges``: The number of hyperedges constructed by the co-conference correlation: :math:`20`.
49+
- ``dim_features``: The dimension of author features: :math:`334`.
50+
- ``features``: The author feature matrix. ``torch.Tensor`` with size :math:`(4,057 \times 334)`.
51+
- ``labels``: The label list. ``torch.LongTensor`` with size :math:`(4,057, )`.
52+
- ``edge_by_paper``: The hyperedge list constructed by the co-paper correlation. ``List`` with length :math:`(14,328 \times 2)`.
53+
- ``edge_by_term``: The hyperedge list constructed by the co-term correlation. ``List`` with length :math:`(7,723 \times 2)`.
54+
- ``edge_by_conf``: The hyperedge list constructed by the co-conference correlation. ``List`` with length :math:`(20 \times 2)`.
55+
- ``paper_author_dict``: The dictionary of ``{paper_id: [author_id, ...]}``. ``Dict`` with length :math:`(14,328)`.
56+
- ``term_paper_dict``: The dictionary of ``{term_id: [paper_id, ...]}``. ``Dict`` with length :math:`(7,723)`.
57+
- ``conf_paper_dict``: The dictionary of ``{conf_id: [paper_id, ...]}``. ``Dict`` with length :math:`(20)`.
58+
"""
59+
60+
def __init__(self, data_root: Optional[str] = None):
61+
super().__init__("dblp_4k", data_root)
62+
self._content = {
63+
'num_classes': 4,
64+
'num_vertices': 4057,
65+
'num_paper_edges': 14328,
66+
'num_term_edges': 7723,
67+
'num_conf_edges': 20,
68+
'dim_features': 334,
69+
"features": {
70+
"upon": [{"filename": "features.pkl", "md5": "7f8e6c3219026c284342d45c01e16406"}],
71+
"loader": load_from_pickle,
72+
"preprocess": [to_tensor, partial(norm_ft, ord=1)],
73+
},
74+
'labels': {
75+
'upon': [{'filename': 'labels.pkl', 'md5': '6ffe5ab8c5670d8b5df595b5c4c63184'}],
76+
'loader': load_from_pickle,
77+
'preprocess': [to_long_tensor]
78+
},
79+
'edge_by_paper': {
80+
'upon': [{'filename': 'edge_by_paper.pkl', 'md5': 'e473eddeb4692f732bc1e47ae94d62c2'}],
81+
'loader': load_from_pickle,
82+
},
83+
'edge_by_term': {
84+
'upon': [{'filename': 'edge_by_term.pkl', 'md5': '1ca7cfbf46a7f5fc743818c65392a0ed'}],
85+
'loader': load_from_pickle,
86+
},
87+
'edge_by_conf': {
88+
'upon': [{'filename': 'edge_by_conf.pkl', 'md5': '890d683b7d8f943ac6d7e87043e0355e'}],
89+
'loader': load_from_pickle,
90+
},
91+
'paper_author_dict': {
92+
'upon': [{'filename': 'paper_author_dict.pkl', 'md5': 'eb2922e010a78961b5b66e77f9bdf950'}],
93+
'loader': load_from_pickle,
94+
},
95+
'term_paper_dict': {
96+
'upon': [{'filename': 'term_paper_dict.pkl', 'md5': '1d71f988b52b0e1da9d12f1d3fe24350'}],
97+
'loader': load_from_pickle,
98+
},
99+
'conf_paper_dict': {
100+
'upon': [{'filename': 'conf_paper_dict.pkl', 'md5': 'cbf87d64dce4ef40d2ab8406e1ee10e1'}],
101+
'loader': load_from_pickle,
102+
},
103+
}

docs/source/api/data.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ Hypergraph Datasets
6161
dhg.data.WalmartTrips
6262
dhg.data.HouseCommittees
6363
dhg.data.News20
64+
dhg.data.DBLP4k
6465
dhg.data.DBLP8k
6566

6667

0 commit comments

Comments
 (0)