Skip to content

Commit 4d41f35

Browse files
HypAR compatible with Cornac >=2.3.1 (#671)
* HypAR compatible with Cornac >=2.3.1 * Pin macos-14 --------- Co-authored-by: Tuan Truong <[email protected]>
1 parent 708741d commit 4d41f35

File tree

9 files changed

+71
-86
lines changed

9 files changed

+71
-86
lines changed

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
strategy:
1717
fail-fast: false
1818
matrix:
19-
os: [windows-latest, ubuntu-22.04, macos-latest]
19+
os: [windows-latest, ubuntu-22.04, macos-14]
2020
python-version: ["3.9", "3.10", "3.11", "3.12"]
2121
env:
2222
LIMIT_NUMPY_VERSION: 2.0.0

.github/workflows/python-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
strategy:
2323
fail-fast: false
2424
matrix:
25-
os: [windows-latest, ubuntu-22.04, macos-latest]
25+
os: [windows-latest, ubuntu-22.04, macos-14]
2626
python-version: ["3.9", "3.10", "3.11", "3.12"]
2727
steps:
2828
- uses: actions/checkout@v4

cornac/models/hypar/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# HypAR changes
2+
We've had to make some changes to the HypAR model to ensure compatibility with numpy 2.x.
3+
The main change is replacing the gensim Word2Vec logic with `sentence-transformers`,
4+
which provides high-quality embeddings and is compatible with numpy 2.x. We therefore do not learn embeddings from
5+
data anymore, but use a pre-trained model instead.
6+
Furthermore, we've updated the requirements file to accommodate the new version.
7+
8+
To validate the new implementation, we ran the original experiments on the Cellphone and Computer datasets.
9+
The table below shows the results before and after the changes. We observe that these changes do slightly affect the
10+
performance. If you want to use the original implementation, use an older version of Cornac (before v2.3.0).
11+
12+
13+
| Dataset | Model Version | AUC | MAP | NDCG |
14+
|-----------|---------------|------------|------------|----------|
15+
| Cellphone | Original | **0.7533** | 0.0517 | 0.2054 |
16+
| | Updated | 0.7493 | **0.0597** | **0.2124** |
17+
| Computer | Original | **0.7278** | 0.0194 | **0.1473** |
18+
| | Updated | 0.7214 | **0.0201** | 0.1462 |
19+

cornac/models/hypar/dgl_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,11 @@ def _generate(self, g, eids, canonical_etype):
249249

250250

251251
def stem_fn(x):
252-
from gensim.parsing import stem_text
253-
252+
from nltk.stem import PorterStemmer
253+
stemmer = PorterStemmer()
254254
# Remove special characters and numbers. Multiple dashes, single quotes, and equal signs, and similar special chars.
255-
return stem_text(re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x))
256-
255+
cleaned = re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x)
256+
return stemmer.stem(cleaned.lower())
257257

258258
def stem(sentiment):
259259
ao_preprocess_fn = stem_fn

cornac/models/hypar/hypar.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,8 @@ def inference(self, node_review_graph, ui_graph, device, batch_size):
946946

947947
# Node preference embedding
948948
if self.preference_module == 'lightgcn':
949+
# Move ui_graph to the same device as the model to avoid device mismatch
950+
ui_graph = ui_graph.to(device)
949951
u, i, _ = self.lightgcn(ui_graph)
950952
x = {'user': u, 'item': i}
951953
else:

cornac/models/hypar/recom_hypar.py

Lines changed: 22 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -411,97 +411,52 @@ def _graph_wrapper(self, train_set, graph_type, *args):
411411

412412
def _ao_embeddings(self, train_set):
413413
"""
414-
Learn aspect and opinion embeddings using word2vec.
414+
Learn aspect and opinion embeddings using sentence-transformers.
415415
Parameters
416416
----------
417417
train_set: dataset
418418
Dataset to use for learning embeddings.
419419
Returns
420420
-------
421-
Aspect and opinion embeddings, and word2vec model.
421+
Aspect and opinion embeddings, and the sentence-transformers model.
422422
"""
423423
from .dgl_utils import generate_mappings, stem_fn
424-
from gensim.models import Word2Vec
425-
from gensim.parsing import remove_stopwords, preprocess_string, stem_text
426424
from nltk.tokenize import word_tokenize
427425
from tqdm import tqdm
428426
import numpy as np
427+
from sentence_transformers import SentenceTransformer
429428

430429
sentiment = train_set.sentiment
431-
432-
# Define preprocess functions for text, aspects and opinions.
433430
preprocess_fn = stem_fn
434431

435-
# Process corpus, getting all sentences and words.
436-
corpus = []
437-
for review in tqdm(train_set.review_text.corpus, desc='Processing text', disable=not self.verbose):
438-
for sentence in review.split('.'):
439-
words = word_tokenize(sentence.replace(' n\'t ', 'n ').replace('/', ' '))
440-
corpus.append(' '.join(preprocess_fn(word) for word in words))
441-
442-
# Process words to match with aos extraction methodology used in SEER.
432+
# Prepare aspect and opinion terms
443433
a_old_new_map = {a: preprocess_fn(a) for a in sentiment.aspect_id_map}
444434
o_old_new_map = {o: preprocess_fn(o) for o in sentiment.opinion_id_map}
445-
446-
# Generate mappings for aspect and opinion ids.
447435
_, _, _, _, _, _, a2a, o2o = generate_mappings(train_set.sentiment, 'a', get_ao_mappings=True)
448436

449-
# Define a progressbar for training word2vec as no information is displayed without.
450-
class CallbackProgressBar:
451-
def __init__(self, verbose):
452-
self.verbose = verbose
453-
self.progress = None
454-
455-
def on_train_begin(self, method):
456-
if self.progress is None:
457-
self.progress = tqdm(desc='Training Word2Vec', total=method.epochs, disable=not self.verbose)
458-
459-
def on_train_end(self, method):
460-
pass
461-
462-
def on_epoch_begin(self, method):
463-
pass
437+
# Load sentence-transformers model (use a small, fast model by default)
438+
model = SentenceTransformer('all-MiniLM-L6-v2')
439+
embedding_dim = model.get_sentence_embedding_dimension()
464440

465-
def on_epoch_end(self, method):
466-
self.progress.update(1)
467-
468-
# Split words on space and get all unique words
469-
wc = [s.split(' ') for s in corpus]
470-
all_words = set(s for se in wc for s in se)
471-
472-
# Assert all aspects and opinions in dataset are in corpus. If not, print missing words.
473-
# New datasets may require more preprocessing.
474-
assert all([a in all_words for a in a_old_new_map.values()]), [a for a in a_old_new_map.values() if
475-
a not in all_words]
476-
assert all([o in all_words for o in o_old_new_map.values()]), [o for o in o_old_new_map.values() if
477-
o not in all_words]
478-
479-
# Train word2vec model using callbacks for progressbar.
480-
l = CallbackProgressBar(self.verbose)
481-
embedding_dim = 100
482-
w2v_model = Word2Vec(wc, vector_size=embedding_dim, min_count=1, window=5, callbacks=[l], epochs=100)
483-
484-
# Keyvector model
485-
kv = w2v_model.wv
441+
# Encode all unique aspect and opinion terms
442+
aspect_terms = [a_old_new_map[a] for a in sentiment.aspect_id_map]
443+
opinion_terms = [o_old_new_map[o] for o in sentiment.opinion_id_map]
444+
aspect_vecs = model.encode(aspect_terms, show_progress_bar=self.verbose)
445+
opinion_vecs = model.encode(opinion_terms, show_progress_bar=self.verbose)
486446

487447
# Initialize embeddings
488448
a_embeddings = np.zeros((len(set(a2a.values())), embedding_dim))
489449
o_embeddings = np.zeros((len(set(o2o.values())), embedding_dim))
490450

491-
# Define function for assigning embeddings to correct aspect.
492-
def get_info(old_new_pairs, mapping, embedding):
493-
for old, new in old_new_pairs:
494-
nid = mapping(old)
495-
vector = np.array(kv.get_vector(new))
496-
embedding[nid] = vector
497-
498-
return embedding
499-
500-
# Assign embeddings to correct aspect and opinion.
501-
a_embeddings = get_info(a_old_new_map.items(), lambda x: a2a[sentiment.aspect_id_map[x]], a_embeddings)
502-
o_embeddings = get_info(o_old_new_map.items(), lambda x: o2o[sentiment.opinion_id_map[x]], o_embeddings)
451+
# Assign embeddings to correct aspect and opinion
452+
for idx, a in enumerate(sentiment.aspect_id_map):
453+
nid = a2a[sentiment.aspect_id_map[a]]
454+
a_embeddings[nid] = aspect_vecs[idx]
455+
for idx, o in enumerate(sentiment.opinion_id_map):
456+
nid = o2o[sentiment.opinion_id_map[o]]
457+
o_embeddings[nid] = opinion_vecs[idx]
503458

504-
return a_embeddings, o_embeddings, kv
459+
return a_embeddings, o_embeddings, model
505460

506461
def _normalize_embedding(self, embedding):
507462
"""
@@ -550,8 +505,10 @@ def _learn_initial_ao_embeddings(self, train_set):
550505
return torch.tensor(a_embeddings), torch.tensor(o_embeddings)
551506

552507
def fit(self, train_set: Dataset, val_set=None):
508+
import os
553509
import torch
554510
from .lightgcn import construct_graph
511+
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
555512

556513
# Initialize self variables
557514
super().fit(train_set, val_set)
Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# Links for torch and dgl
22
-f https://download.pytorch.org/whl/torch_stable.html
3+
-f https://data.dgl.ai/wheels/torch-2.3/repo.html
34

4-
pandas==1.4.*
5-
gensim==4.2.0
5+
pandas==2.2.3
6+
scikit-learn>=1.0.0
7+
nltk>=3.6
68
sentence-transformers==2.2.2
7-
dgl==1.0.*
8-
torch==1.*
9-
filelock==3.8.2
9+
dgl==2.4.0
10+
torch==2.3.*
11+
filelock==3.8.2
12+
huggingface_hub>=0.10.0,<0.16.0

cornac/models/hypar/requirements_cu116.txt

Lines changed: 0 additions & 10 deletions
This file was deleted.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Links for torch and dgl
2+
3+
-f https://data.dgl.ai/wheels/torch-2.3/cu118/repo.html
4+
5+
pandas==2.2.3
6+
scikit-learn>=1.0.0
7+
nltk>=3.6
8+
sentence-transformers==2.2.2
9+
dgl==2.4.0+cu118
10+
filelock==3.8.2
11+
huggingface_hub>=0.10.0,<0.16.0
12+
13+
--index-url https://download.pytorch.org/whl/cu118
14+
torch==2.3.*

0 commit comments

Comments
 (0)