|
1 | 1 | from typing import Optional |
| 2 | +from functools import partial |
2 | 3 |
|
3 | | -from dhg.datapipe import load_from_pickle |
| 4 | +from dhg.datapipe import load_from_pickle, norm_ft, to_bool_tensor, to_tensor, to_long_tensor |
4 | 5 |
|
5 | 6 | from .base import BaseData |
6 | 7 |
|
@@ -29,3 +30,74 @@ def __init__(self, data_root: Optional[str] = None) -> None: |
29 | 30 | }, |
30 | 31 | } |
31 | 32 |
|
| 33 | + |
class DBLP4k(BaseData):
    r"""The DBLP-4k dataset is a citation network hypergraph dataset for the node classification task.

    It is an academic network drawn from four research areas. The full network holds 14,475 authors,
    14,376 papers, and 20 conferences; 4,057 authors, 20 conferences and 100 papers carry a label
    naming one of the four research areas (database, data mining, machine learning, and information retrieval).
    Each vertex denotes an author, and three types of correlation (co-paper, co-term, co-conference)
    can be used for building hyperedges.
    For more details, see the `PathSim: Meta Path-Based Top-K Similarity Search in Heterogeneous Information Networks <http://www.vldb.org/pvldb/vol4/p992-sun.pdf>`_ paper.

    The content of the DBLP-4k dataset includes the following:

    - ``num_classes``: The number of classes: :math:`4`.
    - ``num_vertices``: The number of vertices: :math:`4,057`.
    - ``num_paper_edges``: The number of hyperedges constructed by the co-paper correlation: :math:`14,328`.
    - ``num_term_edges``: The number of hyperedges constructed by the co-term correlation: :math:`7,723`.
    - ``num_conf_edges``: The number of hyperedges constructed by the co-conference correlation: :math:`20`.
    - ``dim_features``: The dimension of author features: :math:`334`.
    - ``features``: The author feature matrix. ``torch.Tensor`` with size :math:`(4,057 \times 334)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(4,057, )`.
    - ``edge_by_paper``: The hyperedge list constructed by the co-paper correlation. ``List`` with length :math:`(14,328 \times 2)`.
    - ``edge_by_term``: The hyperedge list constructed by the co-term correlation. ``List`` with length :math:`(7,723 \times 2)`.
    - ``edge_by_conf``: The hyperedge list constructed by the co-conference correlation. ``List`` with length :math:`(20 \times 2)`.
    - ``paper_author_dict``: The dictionary of ``{paper_id: [author_id, ...]}``. ``Dict`` with length :math:`(14,328)`.
    - ``term_paper_dict``: The dictionary of ``{term_id: [paper_id, ...]}``. ``Dict`` with length :math:`(7,723)`.
    - ``conf_paper_dict``: The dictionary of ``{conf_id: [paper_id, ...]}``. ``Dict`` with length :math:`(20)`.

    Args:
        ``data_root`` (``str``, optional): Forwarded unchanged to the ``BaseData`` constructor. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None):
        super().__init__("dblp_4k", data_root)

        # Fixed dataset statistics come first so the key order of the
        # resulting mapping is: statistics, features, labels, raw items.
        content = {
            "num_classes": 4,
            "num_vertices": 4057,
            "num_paper_edges": 14328,
            "num_term_edges": 7723,
            "num_conf_edges": 20,
            "dim_features": 334,
        }

        # The two items that declare a preprocessing pipeline.
        content["features"] = {
            "upon": [{"filename": "features.pkl", "md5": "7f8e6c3219026c284342d45c01e16406"}],
            "loader": load_from_pickle,
            "preprocess": [to_tensor, partial(norm_ft, ord=1)],
        }
        content["labels"] = {
            "upon": [{"filename": "labels.pkl", "md5": "6ffe5ab8c5670d8b5df595b5c4c63184"}],
            "loader": load_from_pickle,
            "preprocess": [to_long_tensor],
        }

        # Remaining items declare only a file and a loader (no "preprocess"
        # key): (content key, pickle file name, md5 of that file).
        loader_only_items = (
            ("edge_by_paper", "edge_by_paper.pkl", "e473eddeb4692f732bc1e47ae94d62c2"),
            ("edge_by_term", "edge_by_term.pkl", "1ca7cfbf46a7f5fc743818c65392a0ed"),
            ("edge_by_conf", "edge_by_conf.pkl", "890d683b7d8f943ac6d7e87043e0355e"),
            ("paper_author_dict", "paper_author_dict.pkl", "eb2922e010a78961b5b66e77f9bdf950"),
            ("term_paper_dict", "term_paper_dict.pkl", "1d71f988b52b0e1da9d12f1d3fe24350"),
            ("conf_paper_dict", "conf_paper_dict.pkl", "cbf87d64dce4ef40d2ab8406e1ee10e1"),
        )
        for item_name, pickle_name, checksum in loader_only_items:
            content[item_name] = {
                "upon": [{"filename": pickle_name, "md5": checksum}],
                "loader": load_from_pickle,
            }

        # Assign once, after the mapping is complete.
        self._content = content
0 commit comments