add github and facebook datasets

yifanfeng97 · yifanfeng97 · commit 9387412ca83d · 2022-09-19T12:25:20.000+08:00
diff --git a/dhg/data/__init__.py b/dhg/data/__init__.py
@@ -9,6 +9,8 @@
 from .cocitation import CocitationCora, CocitationCiteseer, CocitationPubmed
 from .blogcatalog import BlogCatalog
 from .flickr import Flickr
+from .github import Github
+from .facebook import Facebook
 
 __all__ = [
     "BaseData",
@@ -17,6 +19,8 @@
     "Pubmed",
     "BlogCatalog",
     "Flickr",
+    "Github",
+    "Facebook",
     "Cooking200",
     "MovieLens1M",
     "Yelp2018",
diff --git a/dhg/data/facebook.py b/dhg/data/facebook.py
@@ -0,0 +1,53 @@
+from typing import Optional
+from functools import partial
+
+from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
+
+from .base import BaseData
+
+
+class Facebook(BaseData):
+    r"""The Facebook dataset is a social network dataset for the vertex classification task.
+    It is a page-page graph of verified Facebook sites: vertices correspond to official Facebook pages, and edges to mutual likes between them.
+    The vertex features are extracted from the site descriptions.
+    For more details, see the `Multi-Scale Attributed Node Embedding <https://arxiv.org/pdf/1909.13021.pdf>`_ paper.
+
+    .. note::
+        The L1-normalization for the feature is not recommended for this dataset.
+
+    The content of the Facebook dataset includes the following:
+
+    - ``num_classes``: The number of classes: :math:`4`.
+    - ``num_vertices``: The number of vertices: :math:`22,470`.
+    - ``num_edges``: The number of edges: :math:`85,501`.
+    - ``dim_features``: The dimension of features: :math:`4,714`.
+    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(22,470 \times 4,714)`.
+    - ``edge_list``: The edge list. ``List`` with length :math:`(85,501 \times 2)`.
+    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(22,470, )`.
+
+    Args:
+        ``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the data will be downloaded automatically from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
+    """
+
+    def __init__(self, data_root: Optional[str] = None) -> None:
+        super().__init__("facebook", data_root)
+        self._content = {
+            "num_classes": 4,
+            "num_vertices": 22470,
+            "num_edges": 85501,
+            "dim_features": 8189,  # NOTE(review): the class docstring states 4,714 -- confirm which value is correct
+            "features": {
+                "upon": [{"filename": "features.pkl", "md5": "046eec1b67fb5bf504eaad75e98af141"}],
+                "loader": load_from_pickle,
+                "preprocess": [to_tensor],  # no partial(norm_ft, ord=1): L1-normalization is not recommended here (see the class note)
+            },
+            "edge_list": {
+                "upon": [{"filename": "edge_list.pkl", "md5": "98c6551d020c7741554cae5eab8336ef"}],
+                "loader": load_from_pickle,
+            },
+            "labels": {
+                "upon": [{"filename": "labels.pkl", "md5": "ae0c116274cedc00522df66bd921affc"}],
+                "loader": load_from_pickle,
+                "preprocess": [to_long_tensor],
+            },
+        }
diff --git a/dhg/data/github.py b/dhg/data/github.py
@@ -0,0 +1,54 @@
+from typing import Optional
+from functools import partial
+
+from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
+
+from .base import BaseData
+
+
+class Github(BaseData):
+    r"""The Github dataset is a collaboration network dataset for the vertex classification task.
+    Vertices correspond to developers who have starred at least 10 repositories, and edges to mutual follower relationships.
+    The vertex features are location, starred repositories, employer and e-mail address.
+    The labels are binary, denoting whether a developer is a web developer or a machine learning developer.
+    For more details, see the `Multi-Scale Attributed Node Embedding <https://arxiv.org/pdf/1909.13021.pdf>`_ paper.
+
+    .. note::
+        The L1-normalization for the feature is not recommended for this dataset.
+
+    The content of the Github dataset includes the following:
+
+    - ``num_classes``: The number of classes: :math:`2`.
+    - ``num_vertices``: The number of vertices: :math:`37,700`.
+    - ``num_edges``: The number of edges: :math:`144,501`.
+    - ``dim_features``: The dimension of features: :math:`4,005`.
+    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(37,700 \times 4,005)`.
+    - ``edge_list``: The edge list. ``List`` with length :math:`(144,501 \times 2)`.
+    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(37,700, )`.
+
+    Args:
+        ``data_root`` (``str``, optional): The root directory where the data is stored. If set to ``None``, the data will be downloaded automatically from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
+    """
+
+    def __init__(self, data_root: Optional[str] = None) -> None:
+        super().__init__("github", data_root)
+        self._content = {
+            "num_classes": 2,
+            "num_vertices": 37700,
+            "num_edges": 144501,
+            "dim_features": 4005,
+            "features": {
+                "upon": [{"filename": "features.pkl", "md5": "f097384b61876a22cf048d28a2193c5a"}],
+                "loader": load_from_pickle,
+                "preprocess": [to_tensor],  # no partial(norm_ft, ord=1): L1-normalization is not recommended here (see the class note)
+            },
+            "edge_list": {
+                "upon": [{"filename": "edge_list.pkl", "md5": "57012ac55fe125d8865a693b09f794b3"}],
+                "loader": load_from_pickle,
+            },
+            "labels": {
+                "upon": [{"filename": "labels.pkl", "md5": "9b1282a2a8a23c9f3b480136055c8b6b"}],
+                "loader": load_from_pickle,
+                "preprocess": [to_long_tensor],
+            },
+        }
diff --git a/dhg/datapipe/common.py b/dhg/datapipe/common.py
@@ -27,6 +27,9 @@ def to_tensor(
     elif isinstance(X, scipy.sparse.csr_matrix):
         X = X.todense()
         X = torch.tensor(X)
+    elif isinstance(X, scipy.sparse.coo_matrix):
+        X = X.todense()
+        X = torch.tensor(X)
     elif isinstance(X, np.ndarray):
         X = torch.tensor(X)
     else:
diff --git a/dhg/models/graphs/lightgcn.py b/dhg/models/graphs/lightgcn.py
@@ -18,15 +18,17 @@ class LightGCN(nn.Module):
         ``num_items`` (``int``): The Number of items.
         ``emb_dim`` (``int``): Embedding dimension.
         ``num_layers`` (``int``): The Number of layers. Defaults to ``3``.
+        ``drop_rate`` (``float``): The dropout probability. During training, connections are randomly dropped with probability ``drop_rate``. Defaults to ``0.0``.
     """
 
     def __init__(
-        self, num_users: int, num_items: int, emb_dim: int, num_layers: int = 3
+        self, num_users: int, num_items: int, emb_dim: int, num_layers: int = 3, drop_rate: float = 0.0
     ) -> None:
 
         super().__init__()
         self.num_users, self.num_items = num_users, num_items
         self.num_layers = num_layers
+        self.drop_rate = drop_rate
         self.u_embedding = nn.Embedding(num_users, emb_dim)
         self.i_embedding = nn.Embedding(num_items, emb_dim)
         self.reset_parameters()
@@ -43,13 +45,14 @@ def forward(self, ui_bigraph: BiGraph) -> Tuple[torch.Tensor, torch.Tensor]:
         Args:
             ``ui_bigraph`` (``dhg.BiGraph``): The user-item bipartite graph.
         """
+        drop_rate = self.drop_rate if self.training else 0.0
         u_embs = self.u_embedding.weight
         i_embs = self.i_embedding.weight
         all_embs = torch.cat([u_embs, i_embs], dim=0)
 
         embs_list = [all_embs]
         for _ in range(self.num_layers):
-            all_embs = ui_bigraph.smoothing_with_GCN(all_embs)
+            all_embs = ui_bigraph.smoothing_with_GCN(all_embs, drop_rate=drop_rate)
             embs_list.append(all_embs)
         embs = torch.stack(embs_list, dim=1)
         embs = torch.mean(embs, dim=1)
diff --git a/dhg/structure/base.py b/dhg/structure/base.py
@@ -45,6 +45,7 @@ def __init__(
         extra_selfloop: bool = False,
         device: torch.device = torch.device("cpu"),
     ):
+        assert isinstance(num_v, int) and num_v > 0, "num_v should be a positive integer"
         self.clear()
         self.device = device
         self._num_v = num_v
@@ -374,6 +375,7 @@ def __init__(
         v_weight: Optional[List[float]] = None,
         device: torch.device = torch.device("cpu"),
     ):
+        assert isinstance(num_v, int) and num_v > 0, "num_v should be a positive integer"
         self.clear()
         self._num_v = num_v
         self.device = device
diff --git a/docs/source/api/data.rst b/docs/source/api/data.rst
@@ -21,6 +21,8 @@ Graph Datasets
     dhg.data.Citeseer
     dhg.data.BlogCatalog
     dhg.data.Flickr
+    dhg.data.Github
+    dhg.data.Facebook
 
 
 Hypergraph Datasets
diff --git a/examples/ui_recommender/lightgcn_gowalla.py b/examples/ui_recommender/lightgcn_gowalla.py
@@ -35,12 +35,12 @@ def forward(self, emb_users, emb_items, users, pos_items, neg_items, raw_emb_use
 
 def train(net, data_loader, optimizer, criterion, epoch):
     net.train()
-    dropped_ui_bigraph = ui_bigraph.drop_edges(0.2)
+
     loss_mean, st = 0, time.time()
     for users, pos_items, neg_items in data_loader:
         users, pos_items, neg_items = users.to(device), pos_items.to(device), neg_items.to(device)
         optimizer.zero_grad()
-        emb_users, emb_items = net(dropped_ui_bigraph)
+        emb_users, emb_items = net(ui_bigraph)
         loss = criterion(
             emb_users, emb_items, users, pos_items, neg_items, net.u_embedding.weight, net.i_embedding.weight,
         )
@@ -88,7 +88,7 @@ def test(net, data_loader):
 
 if __name__ == "__main__":
     # from dhg.utils import simple_stdout2file
-    # simple_stdout2file("/home/fengyifan/lightgcn_gowalla.log")
+    # simple_stdout2file("/home/fengyifan/lightgcn_gowalla_drop.log")
     dim_emb = 64
     lr = 0.001
     num_workers = 0