|
| 1 | +from typing import Optional |
| 2 | +from functools import partial |
| 3 | + |
| 4 | +from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor |
| 5 | + |
| 6 | +from .base import BaseData |
| 7 | + |
| 8 | + |
class Cora(BaseData):
    r"""Cora citation network dataset for the vertex classification task.

    More details can be found in this `website <https://relational.fit.cvut.cz/dataset/CORA>`_.

    The Cora dataset provides the following items:

    - ``num_classes``: The number of classes: :math:`7`.
    - ``num_vertices``: The number of vertices: :math:`2,708`.
    - ``num_edges``: The number of edges: :math:`10,858`.
    - ``dim_features``: The dimension of features: :math:`1,433`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(2,708 \times 1,433)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(10,858 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(2,708, )`.
    - ``train_mask``: The train mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
    - ``val_mask``: The validation mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.
    - ``test_mask``: The test mask. ``torch.BoolTensor`` with size :math:`(2,708, )`.

    Args:
        ``data_root`` (``str``, optional): The directory in which the data is stored. If set to ``None``, the data will be auto-downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("cora", data_root)

        # Scalar statistics go in first so the key order of the final
        # mapping is: statistics, then the file-backed items.
        content = dict(
            num_classes=7,
            num_vertices=2708,
            num_edges=10858,
            dim_features=1433,
        )

        # Each file-backed item lists the pickle file(s) it depends on
        # ("upon", with an md5 digest per file), the loader callable and,
        # where needed, a list of preprocess callables.
        # NOTE(review): how these entries are consumed is defined by
        # BaseData, which is not visible here.
        content["features"] = dict(
            upon=[dict(filename="features.pkl", md5="05b45e9c38cc95f4fc44b3668cc9ddc9")],
            loader=load_from_pickle,
            preprocess=[to_tensor, partial(norm_ft, ord=1)],
        )
        # The edge list is stored without any preprocess step.
        content["edge_list"] = dict(
            upon=[dict(filename="edge_list.pkl", md5="f488389c1edd0d898ce273fbd27822b3")],
            loader=load_from_pickle,
        )
        content["labels"] = dict(
            upon=[dict(filename="labels.pkl", md5="e506014762052c6a36cb583c28bdae1d")],
            loader=load_from_pickle,
            preprocess=[to_long_tensor],
        )

        # The three split masks share the same layout and differ only in
        # their file name and digest.
        mask_files = (
            ("train_mask", "train_mask.pkl", "a11357a40e1f0b5cce728d1a961b8e13"),
            ("val_mask", "val_mask.pkl", "355544da566452601bcfa74d30539a71"),
            ("test_mask", "test_mask.pkl", "bbfc87d661560f55f6946f8cb9d602b9"),
        )
        for item_name, pickle_name, digest in mask_files:
            content[item_name] = dict(
                upon=[dict(filename=pickle_name, md5=digest)],
                loader=load_from_pickle,
                preprocess=[to_bool_tensor],
            )

        self._content = content
| 67 | + |
| 68 | + |
class CoraBiGraph(BaseData):
    r"""CoraBiGraph citation network dataset for the vertex classification task.

    These are synthetic bipartite graph datasets that are generated from citation networks (single graph)
    where documents and citation links between them are treated as nodes and undirected edges, respectively.
    More details see the `Cascade-BGNN: Toward Efficient Self-supervised Representation Learning on Large-scale Bipartite Graphs <https://arxiv.org/pdf/1906.11994.pdf>`_ paper.

    The CoraBiGraph dataset provides the following items:

    - ``num_u_classes``: The number of classes in set :math:`U` : :math:`7`.
    - ``num_u_vertices``: The number of vertices in set :math:`U` : :math:`1,312`.
    - ``num_v_vertices``: The number of vertices in set :math:`V` : :math:`789`.
    - ``num_edges``: The number of edges: :math:`2,314`.
    - ``dim_u_features``: The dimension of features in set :math:`U` : :math:`1,433`.
    - ``dim_v_features``: The dimension of features in set :math:`V` : :math:`1,433`.
    - ``u_features``: The vertex feature matrix in set :math:`U`. ``torch.Tensor`` with size :math:`(1,312 \times 1,433)`.
    - ``v_features``: The vertex feature matrix in set :math:`V` . ``torch.Tensor`` with size :math:`(789 \times 1,433)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(2,314 \times 2)`.
    - ``u_labels``: The label list in set :math:`U` . ``torch.LongTensor`` with size :math:`(1,312, )`.

    Args:
        ``data_root`` (``str``, optional): The directory in which the data is stored. If set to ``None``, the data will be auto-downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("cora_bigraph", data_root)

        # Scalar statistics go in first so the key order of the final
        # mapping is: statistics, then the file-backed items.
        content = dict(
            num_u_classes=7,
            num_u_vertices=1312,
            num_v_vertices=789,
            num_edges=2314,
            dim_u_features=1433,
            dim_v_features=1433,
        )

        # Both feature matrices share the same layout and differ only in
        # their file name and digest. A new partial is created for each
        # entry, so the two preprocess lists do not share objects.
        feature_files = (
            ("u_features", "u_features.pkl", "84f0ecee4233ca70d40d36f457470032"),
            ("v_features", "v_features.pkl", "de65cd478ea05333c26184bc8b2cb468"),
        )
        for item_name, pickle_name, digest in feature_files:
            content[item_name] = dict(
                upon=[dict(filename=pickle_name, md5=digest)],
                loader=load_from_pickle,
                preprocess=[to_tensor, partial(norm_ft, ord=1)],
            )

        # The edge list is stored without any preprocess step.
        content["edge_list"] = dict(
            upon=[dict(filename="edge_list.pkl", md5="e7b82c4a8305e1488beac1b788ad46e3")],
            loader=load_from_pickle,
        )
        content["u_labels"] = dict(
            upon=[dict(filename="u_labels.pkl", md5="65dff86f7920cdab61790d48a39f2e5b")],
            loader=load_from_pickle,
            preprocess=[to_long_tensor],
        )

        self._content = content
| 121 | + |
0 commit comments