|
| 1 | +from typing import Optional |
| 2 | +from functools import partial |
| 3 | + |
| 4 | +from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor |
| 5 | + |
| 6 | +from .base import BaseData |
| 7 | + |
| 8 | + |
class BlogCatalog(BaseData):
    r"""The BlogCatalog dataset is a social network dataset for vertex classification task.
    This is a network of social relationships of bloggers from the BlogCatalog website,
    where nodes' attributes are constructed by keywords, which are generated by users as a short description of their blogs.
    The labels represent the topic categories provided by the authors.

    .. note::
        The L1-normalization for the feature is not recommended for this dataset.

    The content of the BlogCatalog dataset includes the following:

    - ``num_classes``: The number of classes: :math:`6`.
    - ``num_vertices``: The number of vertices: :math:`5,196`.
    - ``num_edges``: The number of edges: :math:`343,486`.
    - ``dim_features``: The dimension of features: :math:`8,189`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(5,196 \times 8,189)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(343,486 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(5,196, )`.

    Args:
        ``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the data will be auto-downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("BlogCatalog", data_root)
        # Scalar entries are plain metadata. Dict entries describe a file-backed
        # item: the files it depends on (with md5 checksums), the loader for
        # those files, and an optional list of preprocess callables.
        self._content = {
            "num_classes": 6,
            "num_vertices": 5196,
            # Kept equal to the documented length of ``edge_list`` (343,486);
            # the previous value 171743 was exactly half of it and
            # contradicted the class docstring.
            "num_edges": 343486,
            "dim_features": 8189,
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "ecdd26c63f483c4d919a156f9c8e92fc"}],
                "loader": load_from_pickle,
                # Only tensor conversion is applied: the L1 feature
                # normalization (``partial(norm_ft, ord=1)``) is deliberately
                # left out because it is not recommended for this dataset.
                "preprocess": [to_tensor],
            },
            "edge_list": {
                "upon": [{"filename": "edge_list.pkl", "md5": "03ffbc8c9a4d9abeab0f127c717888f0"}],
                "loader": load_from_pickle,
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "246e7096dd834a631c33fe0c7afb89b4"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
        }
0 commit comments