HypAR compatible with Cornac >=2.3.1 (#671)

theisjendal · qtuantruong · web-flow · commit 4d41f3532cc9 · 2025-10-02T09:46:38.000-07:00
* HypAR compatible with Cornac >=2.3.1

* Pin macos-14

---------

Co-authored-by: Tuan Truong <6743100+qtuantruong@users.noreply.github.com>
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-22.04, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-14]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     env:
       LIMIT_NUMPY_VERSION: 2.0.0
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -22,7 +22,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-22.04, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-14]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@v4
diff --git a/cornac/models/hypar/README.md b/cornac/models/hypar/README.md
@@ -0,0 +1,19 @@
+# HypAR changes
+We've had to make some changes to the HypAR model to ensure compatibility with numpy 2.x. 
+The main change is replacing the gensim Word2Vec logic with `sentence-transformers`, 
+which provides high-quality embeddings and is compatible with numpy 2.x. We therefore do not learn embeddings from 
+data anymore, but use a pre-trained model instead.
+Furthermore, we've updated the requirements file to accommodate the new version.
+
+To validate the new implementation, we ran the original experiments on the Cellphone and Computer datasets. 
+The table below shows the results before and after the changes. We observe that these changes do slightly affect the 
+performance. If you want to use the original implementation, use an older version of Cornac (before v2.3.0).
+
+
+| Dataset   | Model Version | AUC        | MAP        | NDCG     |
+|-----------|---------------|------------|------------|----------|
+| Cellphone | Original      | **0.7533** | 0.0517     | 0.2054   |
+|           | Updated       | 0.7493     | **0.0597** | **0.2124** |
+| Computer  | Original      | **0.7278** | 0.0194     | **0.1473** |
+|           | Updated       | 0.7214     | **0.0201** | 0.1462   |
+
diff --git a/cornac/models/hypar/dgl_utils.py b/cornac/models/hypar/dgl_utils.py
@@ -249,11 +249,11 @@ def _generate(self, g, eids, canonical_etype):
 
 
 def stem_fn(x):
-    from gensim.parsing import stem_text
-
+    from nltk.stem import PorterStemmer
+    stemmer = PorterStemmer()
     # Remove special characters and numbers. Multiple dashes, single quotes, and equal signs, and similar special chars.
-    return stem_text(re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x))
-
+    cleaned = re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x)
+    return stemmer.stem(cleaned.lower())
 
 def stem(sentiment):
     ao_preprocess_fn = stem_fn
diff --git a/cornac/models/hypar/hypar.py b/cornac/models/hypar/hypar.py
@@ -946,6 +946,8 @@ def inference(self, node_review_graph, ui_graph, device, batch_size):
 
         # Node preference embedding
         if self.preference_module == 'lightgcn':
+            # Move ui_graph to the same device as the model to avoid device mismatch
+            ui_graph = ui_graph.to(device)
             u, i, _ = self.lightgcn(ui_graph)
             x = {'user': u, 'item': i}
         else:
diff --git a/cornac/models/hypar/recom_hypar.py b/cornac/models/hypar/recom_hypar.py
@@ -411,97 +411,52 @@ def _graph_wrapper(self, train_set, graph_type, *args):
 
     def _ao_embeddings(self, train_set):
         """
-        Learn aspect and opinion embeddings using word2vec.
+        Compute aspect and opinion embeddings with a pre-trained sentence-transformers model.
         Parameters
         ----------
         train_set: dataset
             Dataset to use for learning embeddings.
         Returns
         -------
-            Aspect and opinion embeddings, and word2vec model.
+            Aspect and opinion embeddings, and the sentence-transformers model.
         """
         from .dgl_utils import generate_mappings, stem_fn
-        from gensim.models import Word2Vec
-        from gensim.parsing import remove_stopwords, preprocess_string, stem_text
         from nltk.tokenize import word_tokenize
         from tqdm import tqdm
         import numpy as np
+        from sentence_transformers import SentenceTransformer
 
         sentiment = train_set.sentiment
-
-        # Define preprocess functions for text, aspects and opinions.
         preprocess_fn = stem_fn
 
-        # Process corpus, getting all sentences and words.
-        corpus = []
-        for review in tqdm(train_set.review_text.corpus, desc='Processing text', disable=not self.verbose):
-            for sentence in review.split('.'):
-                words = word_tokenize(sentence.replace(' n\'t ', 'n ').replace('/', ' '))
-                corpus.append(' '.join(preprocess_fn(word) for word in words))
-
-        # Process words to match with aos extraction methodology used in SEER.
+        # Prepare aspect and opinion terms
         a_old_new_map = {a: preprocess_fn(a) for a in sentiment.aspect_id_map}
         o_old_new_map = {o: preprocess_fn(o) for o in sentiment.opinion_id_map}
-
-        # Generate mappings for aspect and opinion ids.
         _, _, _, _, _, _, a2a, o2o = generate_mappings(train_set.sentiment, 'a', get_ao_mappings=True)
 
-        # Define a progressbar for training word2vec as no information is displayed without.
-        class CallbackProgressBar:
-            def __init__(self, verbose):
-                self.verbose = verbose
-                self.progress = None
-
-            def on_train_begin(self, method):
-                if self.progress is None:
-                    self.progress = tqdm(desc='Training Word2Vec', total=method.epochs, disable=not self.verbose)
-
-            def on_train_end(self, method):
-                pass
-
-            def on_epoch_begin(self, method):
-                pass
+        # Load sentence-transformers model (use a small, fast model by default)
+        model = SentenceTransformer('all-MiniLM-L6-v2')
+        embedding_dim = model.get_sentence_embedding_dimension()
 
-            def on_epoch_end(self, method):
-                self.progress.update(1)
-
-        # Split words on space and get all unique words
-        wc = [s.split(' ') for s in corpus]
-        all_words = set(s for se in wc for s in se)
-
-        # Assert all aspects and opinions in dataset are in corpus. If not, print missing words.
-        # New datasets may require more preprocessing.
-        assert all([a in all_words for a in a_old_new_map.values()]), [a for a in a_old_new_map.values() if
-                                                                       a not in all_words]
-        assert all([o in all_words for o in o_old_new_map.values()]), [o for o in o_old_new_map.values() if
-                                                                       o not in all_words]
-
-        # Train word2vec model using callbacks for progressbar.
-        l = CallbackProgressBar(self.verbose)
-        embedding_dim = 100
-        w2v_model = Word2Vec(wc, vector_size=embedding_dim, min_count=1, window=5, callbacks=[l], epochs=100)
-
-        # Keyvector model
-        kv = w2v_model.wv
+        # Encode all unique aspect and opinion terms
+        aspect_terms = [a_old_new_map[a] for a in sentiment.aspect_id_map]
+        opinion_terms = [o_old_new_map[o] for o in sentiment.opinion_id_map]
+        aspect_vecs = model.encode(aspect_terms, show_progress_bar=self.verbose)
+        opinion_vecs = model.encode(opinion_terms, show_progress_bar=self.verbose)
 
         # Initialize embeddings
         a_embeddings = np.zeros((len(set(a2a.values())), embedding_dim))
         o_embeddings = np.zeros((len(set(o2o.values())), embedding_dim))
 
-        # Define function for assigning embeddings to correct aspect.
-        def get_info(old_new_pairs, mapping, embedding):
-            for old, new in old_new_pairs:
-                nid = mapping(old)
-                vector = np.array(kv.get_vector(new))
-                embedding[nid] = vector
-
-            return embedding
-
-        # Assign embeddings to correct aspect and opinion.
-        a_embeddings = get_info(a_old_new_map.items(), lambda x: a2a[sentiment.aspect_id_map[x]], a_embeddings)
-        o_embeddings = get_info(o_old_new_map.items(), lambda x: o2o[sentiment.opinion_id_map[x]], o_embeddings)
+        # Assign embeddings to correct aspect and opinion
+        for idx, a in enumerate(sentiment.aspect_id_map):
+            nid = a2a[sentiment.aspect_id_map[a]]
+            a_embeddings[nid] = aspect_vecs[idx]
+        for idx, o in enumerate(sentiment.opinion_id_map):
+            nid = o2o[sentiment.opinion_id_map[o]]
+            o_embeddings[nid] = opinion_vecs[idx]
 
-        return a_embeddings, o_embeddings, kv
+        return a_embeddings, o_embeddings, model
 
     def _normalize_embedding(self, embedding):
         """
@@ -550,8 +505,10 @@ def _learn_initial_ao_embeddings(self, train_set):
         return torch.tensor(a_embeddings), torch.tensor(o_embeddings)
 
     def fit(self, train_set: Dataset, val_set=None):
+        import os
         import torch
         from .lightgcn import construct_graph
+        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
         # Initialize self variables
         super().fit(train_set, val_set)
diff --git a/cornac/models/hypar/requirements.txt b/cornac/models/hypar/requirements.txt
@@ -1,9 +1,12 @@
 # Links for torch and dgl
 -f https://download.pytorch.org/whl/torch_stable.html
+-f https://data.dgl.ai/wheels/torch-2.3/repo.html
 
-pandas==1.4.*
-gensim==4.2.0
+pandas==2.2.3
+scikit-learn>=1.0.0
+nltk>=3.6
 sentence-transformers==2.2.2
-dgl==1.0.*
-torch==1.*
-filelock==3.8.2
+dgl==2.4.0
+torch==2.3.*
+filelock==3.8.2
+huggingface_hub>=0.10.0,<0.16.0
diff --git a/cornac/models/hypar/requirements_cu116.txt b/cornac/models/hypar/requirements_cu116.txt
diff --git a/cornac/models/hypar/requirements_cu118.txt b/cornac/models/hypar/requirements_cu118.txt
@@ -0,0 +1,14 @@
+# Links for torch and dgl
+
+-f https://data.dgl.ai/wheels/torch-2.3/cu118/repo.html
+
+pandas==2.2.3
+scikit-learn>=1.0.0
+nltk>=3.6
+sentence-transformers==2.2.2
+dgl==2.4.0+cu118
+filelock==3.8.2
+huggingface_hub>=0.10.0,<0.16.0
+
+--index-url https://download.pytorch.org/whl/cu118
+torch==2.3.*