|
| 1 | +from typing import Optional |
| 2 | +from functools import partial |
| 3 | + |
| 4 | +from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor |
| 5 | + |
| 6 | +from .base import BaseData |
| 7 | + |
| 8 | + |
class Github(BaseData):
    r"""The Github dataset is a collaboration network dataset for vertex classification task.
    Nodes correspond to developers who have starred at least 10 repositories and edges to mutual follower relationships.
    Node features are location, starred repositories, employer and e-mail address.
    The labels are binary, denoting whether a developer is a web developer or a machine learning developer.
    More details see the `Multi-Scale Attributed Node Embedding <https://arxiv.org/pdf/1909.13021.pdf>`_ paper.

    .. note::
        The L1-normalization for the feature is not recommended for this dataset.

    The content of the Github dataset includes the following:

    - ``num_classes``: The number of classes: :math:`2`.
    - ``num_vertices``: The number of vertices: :math:`37,700`.
    - ``num_edges``: The number of edges: :math:`144,501`.
    - ``dim_features``: The dimension of features: :math:`4,005`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(37,700 \times 4,005)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(144,501 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(37,700, )`.

    Args:
        ``data_root`` (``str``, optional): The root directory in which the data is stored. If set to ``None``, this function will auto-download from server and save into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        """Register the dataset under the name ``"github"`` and declare its content table."""
        super().__init__("github", data_root)
        # Scalar entries are plain statistics. File-backed entries name the
        # pickle file(s) they depend on ("upon", each with its md5), the
        # loader, and an optional list of "preprocess" callables.
        # NOTE(review): presumably BaseData uses the md5 to validate the
        # downloaded files -- confirm against BaseData.
        self._content = {
            "num_classes": 2,
            "num_vertices": 37700,
            "num_edges": 144501,
            "dim_features": 4005,
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "f097384b61876a22cf048d28a2193c5a"}],
                "loader": load_from_pickle,
                # L1 feature normalization (``partial(norm_ft, ord=1)``) is
                # deliberately left out: it is not recommended for this dataset.
                "preprocess": [to_tensor],
            },
            "edge_list": {
                "upon": [{"filename": "edge_list.pkl", "md5": "57012ac55fe125d8865a693b09f794b3"}],
                "loader": load_from_pickle,
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "9b1282a2a8a23c9f3b480136055c8b6b"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
        }
0 commit comments