@@ -411,97 +411,52 @@ def _graph_wrapper(self, train_set, graph_type, *args):
411411
def _ao_embeddings(self, train_set):
    """
    Learn aspect and opinion embeddings using sentence-transformers.

    Every aspect and opinion term of the dataset is stemmed, encoded with a
    pretrained sentence-transformers model, and written to the embedding row
    given by the aspect/opinion node mappings from ``generate_mappings``.

    Parameters
    ----------
    train_set: dataset
        Dataset to use for learning embeddings. Its ``sentiment`` attribute
        must expose ``aspect_id_map`` and ``opinion_id_map``.

    Returns
    -------
    Aspect and opinion embeddings, and the sentence-transformers model.
    """
    from .dgl_utils import generate_mappings, stem_fn
    import numpy as np
    from sentence_transformers import SentenceTransformer

    sentiment = train_set.sentiment

    # NOTE(review): terms are stemmed before encoding, exactly as the
    # word2vec-based version did. A pretrained sentence encoder may
    # represent the unstemmed surface forms better -- confirm before changing.
    preprocess_fn = stem_fn

    # Only the aspect-to-node and opinion-to-node mappings are needed here.
    _, _, _, _, _, _, a2a, o2o = generate_mappings(sentiment, 'a', get_ao_mappings=True)

    # Load sentence-transformers model (a small, fast model by default).
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding_dim = model.get_sentence_embedding_dimension()

    def embed_terms(id_map, node_map):
        """Encode the terms of ``id_map`` and place each vector at its node row."""
        terms = [preprocess_fn(term) for term in id_map]
        vectors = model.encode(terms, show_progress_bar=self.verbose)

        # One row per distinct node id; rows that receive no term stay zero.
        embeddings = np.zeros((len(set(node_map.values())), embedding_dim))

        # ``vectors`` follows the iteration order of ``id_map``. When several
        # terms share a node id, the last one encountered wins.
        for term, vector in zip(id_map, vectors):
            embeddings[node_map[id_map[term]]] = vector

        return embeddings

    a_embeddings = embed_terms(sentiment.aspect_id_map, a2a)
    o_embeddings = embed_terms(sentiment.opinion_id_map, o2o)

    return a_embeddings, o_embeddings, model
505460
506461 def _normalize_embedding (self , embedding ):
507462 """
@@ -550,8 +505,10 @@ def _learn_initial_ao_embeddings(self, train_set):
550505 return torch .tensor (a_embeddings ), torch .tensor (o_embeddings )
551506
552507 def fit (self , train_set : Dataset , val_set = None ):
508+ import os
553509 import torch
554510 from .lightgcn import construct_graph
511+ os .environ ['TOKENIZERS_PARALLELISM' ] = 'false'
555512
556513 # Initialize self variables
557514 super ().fit (train_set , val_set )
0 commit comments