Skip to content

Commit aa25822

Browse files
committed
add IMDB-4k dataset
1 parent cbd118d commit aa25822

File tree

3 files changed

+69
-4
lines changed

3 files changed

+69
-4
lines changed

dhg/data/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from .cora import Cora, CoraBiGraph
1919
from .citeseer import Citeseer, CiteseerBiGraph
2020
from .pubmed import Pubmed, PubmedBiGraph
21+
from .imdb import IMDB4k
2122

2223
__all__ = [
2324
"BaseData",
@@ -48,4 +49,5 @@
4849
"WalmartTrips",
4950
"HouseCommittees",
5051
"News20",
52+
"IMDB4k",
5153
]

dhg/data/dblp.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def __init__(self, data_root: Optional[str] = None) -> None:
3232

3333

3434
class DBLP4k(BaseData):
35-
r"""The DBLP-4k dataset is a citation network hypergraph dataset for node classification task.
35+
r"""The DBLP-4k dataset is a citation network dataset for node classification task.
3636
The dataset is an academic network from four research areas. There are 14,475 authors,
3737
14,376 papers, and 20 conferences, among which 4,057 authors, 20 conferences and
3838
100 papers are labeled with one of the four research areas (database, data mining, machine learning, and information retrieval).
@@ -49,12 +49,15 @@ class DBLP4k(BaseData):
4949
- ``dim_features``: The dimension of author features: :math:`334`.
5050
- ``features``: The author feature matrix. ``torch.Tensor`` with size :math:`(4,057 \times 334)`.
5151
- ``labels``: The label list. ``torch.LongTensor`` with size :math:`(4,057, )`.
52-
- ``edge_by_paper``: The hyperedge list constructed by the co-paper correlation. ``List`` with length :math:`(14,328 \times 2)`.
53-
- ``edge_by_term``: The hyperedge list constructed by the co-term correlation. ``List`` with length :math:`(7,723 \times 2)`.
54-
- ``edge_by_conf``: The hyperedge list constructed by the co-conference correlation. ``List`` with length :math:`(20 \times 2)`.
52+
- ``edge_by_paper``: The hyperedge list constructed by the co-paper correlation. ``List`` with length :math:`(14,328)`.
53+
- ``edge_by_term``: The hyperedge list constructed by the co-term correlation. ``List`` with length :math:`(7,723)`.
54+
- ``edge_by_conf``: The hyperedge list constructed by the co-conference correlation. ``List`` with length :math:`(20)`.
5555
- ``paper_author_dict``: The dictionary of ``{paper_id: [author_id, ...]}``. ``Dict`` with length :math:`(14,328)`.
5656
- ``term_paper_dict``: The dictionary of ``{term_id: [paper_id, ...]}``. ``Dict`` with length :math:`(7,723)`.
5757
- ``conf_paper_dict``: The dictionary of ``{conf_id: [paper_id, ...]}``. ``Dict`` with length :math:`(20)`.
58+
59+
Args:
60+
``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the dataset will be automatically downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
5861
"""
5962

6063
def __init__(self, data_root: Optional[str] = None):

dhg/data/imdb.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_bool_tensor, to_tensor, to_long_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class IMDB4k(BaseData):
    r"""The IMDB-4k dataset is a movie dataset for the node classification task.
    The dataset is an online database about movies and television programs, including information such as cast, production crew, and plot summaries.
    This is a subset of IMDB scraped from the web, containing 4278 movies, 2081 directors, and 5257 actors after data preprocessing.
    Movies are labeled as one of three classes (Action, Comedy, and Drama) based on their genre information.
    Each movie is also described by a bag-of-words representation of its plot keywords.
    Each vertex denotes a movie, and two types of correlation (co-director, co-actor) can be used for building hyperedges.
    For more details, see the `MAGNN: Metapath Aggregated Graph Neural Network for Heterogeneous Graph Embedding <https://arxiv.org/pdf/2002.01680.pdf>`_ paper.

    The content of the IMDB-4k dataset includes the following:

    - ``num_classes``: The number of classes: :math:`3`.
    - ``num_vertices``: The number of vertices: :math:`4,278`.
    - ``num_director_edges``: The number of hyperedges constructed by the co-director correlation: :math:`2,081`.
    - ``num_actor_edges``: The number of hyperedges constructed by the co-actor correlation: :math:`5,257`.
    - ``dim_features``: The dimension of movie features: :math:`3,066`.
    - ``features``: The movie feature matrix. ``torch.Tensor`` with size :math:`(4,278 \times 3,066)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(4,278, )`.
    - ``edge_by_director``: The hyperedge list constructed by the co-director correlation. ``List`` with length :math:`(2,081)`.
    - ``edge_by_actor``: The hyperedge list constructed by the co-actor correlation. ``List`` with length :math:`(5,257)`.

    Args:
        ``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the dataset will be automatically downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("imdb_4k", data_root)
        self._content = {
            # Static dataset statistics.
            "num_classes": 3,
            "num_vertices": 4278,
            "num_director_edges": 2081,
            "num_actor_edges": 5257,
            "dim_features": 3066,
            # File-backed items: each entry names the source file(s) in ``upon``
            # together with an md5 digest (presumably used by BaseData to verify
            # the download -- confirm), the ``loader`` used to read them, and an
            # optional list of ``preprocess`` callables.
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "b9cca982d3d5066ddb2013951939c070"}],
                "loader": load_from_pickle,
                # Converted to a tensor, then passed through ``norm_ft`` with ``ord=1``.
                "preprocess": [to_tensor, partial(norm_ft, ord=1)],
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "a45e5af53d5475ac87f5d8aa779b3a20"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
            "edge_by_director": {
                "upon": [{"filename": "edge_by_director.pkl", "md5": "671b7c2010e8604f037523738323cd78"}],
                "loader": load_from_pickle,
            },
            "edge_by_actor": {
                "upon": [{"filename": "edge_by_actor.pkl", "md5": "dff7557861445de77b05d6215746c9f1"}],
                "loader": load_from_pickle,
            },
        }

0 commit comments

Comments
 (0)