Skip to content

Commit a5a7965

Browse files
committed
add recipe-100k and recipe-200k datasets
1 parent e92ed01 commit a5a7965

File tree

4 files changed

+101
-1
lines changed

4 files changed

+101
-1
lines changed

dhg/data/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from .citeseer import Citeseer, CiteseerBiGraph
2020
from .pubmed import Pubmed, PubmedBiGraph
2121
from .imdb import IMDB4k
22+
from .recipe import Recipe100k, Recipe200k
2223

2324
__all__ = [
2425
"BaseData",
@@ -50,4 +51,6 @@
5051
"HouseCommittees",
5152
"News20",
5253
"IMDB4k",
54+
"Recipe100k",
55+
"Recipe200k"
5356
]

dhg/data/news.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
class News20(BaseData):
1010
r"""The 20 Newsgroups dataset is a newspaper network dataset for vertex classification task.
11-
The node features are the TF-IDF representations of news messages.
11+
The vertex features are the TF-IDF representations of news messages.
1212
For more details, see the `YOU ARE ALLSET: A MULTISET LEARNING FRAMEWORK FOR HYPERGRAPH NEURAL NETWORKS <https://openreview.net/pdf?id=hpBTIv2uy_E>`_ paper.
1313
1414
The content of the 20 Newsgroups dataset includes the following:

dhg/data/recipe.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class Recipe100k(BaseData):
10+
r"""The Recipe100k dataset is a recipe-ingredient network dataset for vertex classification task.
11+
The vertex features are the bag of words from the sentences that make up the recipe.
12+
Hyperedges are the ingredients of the recipe or the keywords for food preparation steps.
13+
The original dataset was created in the `SHARE: a System for Hierarchical Assistive Recipe Editing <https://arxiv.org/pdf/2105.08185.pdf>`_ paper.
14+
15+
The content of the Recipe100k dataset includes the following:
16+
17+
- ``num_classes``: The number of classes: :math:`8`.
18+
- ``num_vertices``: The number of vertices: :math:`101,585`.
19+
- ``num_edges``: The number of edges: :math:`12,387`.
20+
- ``dim_features``: The dimension of features: :math:`2,254`.
21+
- ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(101,585 \times 2,254)`.
22+
- ``edge_list``: The edge list. ``List`` with length :math:`12,387`.
23+
- ``labels``: The label list. ``torch.LongTensor`` with size :math:`(101,585, )`.
24+
25+
Args:
26+
``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the dataset will be downloaded automatically from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
27+
"""
28+
29+
def __init__(self, data_root: Optional[str] = None) -> None:
30+
super().__init__("recipe-100k-v2", data_root)
31+
self._content = {
32+
"num_classes": 8,
33+
"num_vertices": 101585,
34+
"num_edges": 12387,
35+
"dim_features": 2254,
36+
"features": {
37+
"upon": [{"filename": "features.pkl", "md5": "4fdd76cd4108fd07bdd62368067c1eaf"}],
38+
"loader": load_from_pickle,
39+
"preprocess": [to_tensor,],
40+
},
41+
"edge_list": {
42+
"upon": [{"filename": "edge_list.pkl", "md5": "3dc1d8fe7a0f91b5c56057500bda9021"}],
43+
"loader": load_from_pickle,
44+
},
45+
"labels": {
46+
"upon": [{"filename": "labels.pkl", "md5": "bd8a3bcaef27a58c6d1d5def255c5065"}],
47+
"loader": load_from_pickle,
48+
"preprocess": [to_long_tensor],
49+
},
50+
}
51+
52+
53+
class Recipe200k(BaseData):
54+
r"""The Recipe200k dataset is a recipe-ingredient network dataset for vertex classification task.
55+
The vertex features are the bag of words from the sentences that make up the recipe.
56+
Hyperedges are the ingredients of the recipe or the keywords for food preparation steps.
57+
The original dataset was created in the `SHARE: a System for Hierarchical Assistive Recipe Editing <https://arxiv.org/pdf/2105.08185.pdf>`_ paper.
58+
59+
The content of the Recipe200k dataset includes the following:
60+
61+
- ``num_classes``: The number of classes: :math:`8`.
62+
- ``num_vertices``: The number of vertices: :math:`240,094`.
63+
- ``num_edges``: The number of edges: :math:`18,129`.
64+
- ``dim_features``: The dimension of features: :math:`3,200`.
65+
- ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(240,094 \times 3,200)`.
66+
- ``edge_list``: The edge list. ``List`` with length :math:`18,129`.
67+
- ``labels``: The label list. ``torch.LongTensor`` with size :math:`(240,094, )`.
68+
69+
Args:
70+
``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the dataset will be downloaded automatically from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
71+
"""
72+
73+
def __init__(self, data_root: Optional[str] = None) -> None:
74+
super().__init__("recipe-200k-v2", data_root)
75+
self._content = {
76+
"num_classes": 8,
77+
"num_vertices": 240094,
78+
"num_edges": 18129,
79+
"dim_features": 3200,
80+
"features": {
81+
"upon": [{"filename": "features.pkl", "md5": "a5df55a3e9591d7389f6ea5f09a483f4"}],
82+
"loader": load_from_pickle,
83+
"preprocess": [to_tensor,],
84+
},
85+
"edge_list": {
86+
"upon": [{"filename": "edge_list.pkl", "md5": "163ad784e35e56650fc22658d3e88767"}],
87+
"loader": load_from_pickle,
88+
},
89+
"labels": {
90+
"upon": [{"filename": "labels.pkl", "md5": "05bee03f1c5383f0cde5ea879be090af"}],
91+
"loader": load_from_pickle,
92+
"preprocess": [to_long_tensor],
93+
},
94+
}
95+

docs/source/api/data.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ Hypergraph Datasets
6464
dhg.data.DBLP4k
6565
dhg.data.DBLP8k
6666
dhg.data.IMDB4k
67+
dhg.data.Recipe100k
68+
dhg.data.Recipe200k
6769

6870

6971
**Welcome to contribute datasets!**

0 commit comments

Comments
 (0)