@@ -457,7 +457,7 @@ def fit_transform(
         logger.info("Embedding - Transforming documents to embeddings.")
         self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
         embeddings = self._extract_embeddings(
-            documents.Document.values.tolist(),
+            documents.Document.to_numpy().tolist(),
             images=images,
             method="document",
             verbose=self.verbose,
@@ -503,7 +503,7 @@ def fit_transform(
         documents = self._sort_mappings_by_frequency(documents)

         # Create documents from images if we have images only
-        if documents.Document.values[0] is None:
+        if documents.Document.to_numpy()[0] is None:
             custom_documents = self._images_to_text(documents, embeddings)

         # Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.
@@ -726,7 +726,7 @@ def partial_fit(
                 self.embedding_model, language=self.language, verbose=self.verbose
             )
             embeddings = self._extract_embeddings(
-                documents.Document.values.tolist(),
+                documents.Document.to_numpy().tolist(),
                 method="document",
                 verbose=self.verbose,
             )
@@ -926,7 +926,7 @@ def topics_over_time(
             # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
             # by simply taking the average of the two
             if global_tuning:
-                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
+                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.to_numpy()]
                 c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0

             # Extract the words per topic
@@ -1010,11 +1010,11 @@ def topics_per_class(
             # by simply taking the average of the two
             if global_tuning:
                 c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)
-                c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0
+                c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.to_numpy() + self._outliers] + c_tf_idf) / 2.0

             # Extract the words per topic
             words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
-            topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()
+            topic_frequency = pd.Series(documents_per_topic.Class.to_numpy(), index=documents_per_topic.Topic).to_dict()

             # Fill dataframe with results
             topics_at_class = [
@@ -1796,7 +1796,7 @@ def get_document_info(

         # Add topic info through `.get_topic_info()`
         topic_info = self.get_topic_info().drop("Count", axis=1)
-        document_info = pd.merge(document_info, topic_info, on="Topic", how="left")
+        document_info = document_info.merge(topic_info, on="Topic", how="left")

         # Add top n words
         top_n_words = {topic: " - ".join(next(zip(*self.get_topic(topic)))) for topic in set(self.topics_)}
@@ -1941,7 +1941,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
                 (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),
                 "Distance",
             ]
-            distance = distance.values[0] if len(distance) > 0 else 10
+            distance = distance.to_numpy()[0] if len(distance) > 0 else 10

             if parent != start:
                 if grandpa is None:
@@ -4059,7 +4059,7 @@ def _zeroshot_topic_modeling(
         embeddings = embeddings[non_assigned_ids]

         if len(documents) == 0:
-            self.topics_ = assigned_documents["Topic"].values.tolist()
+            self.topics_ = assigned_documents["Topic"].to_numpy().tolist()
             self.topic_mapper_ = TopicMapper(self.topics_)

         logger.info("Zeroshot Step 1 - Completed \u2713")
@@ -4280,7 +4280,7 @@ def _extract_representative_docs(
         for index, topic in enumerate(labels):
             # Slice data
             selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
-            selected_docs = selection["Document"].values
+            selected_docs = selection["Document"].to_numpy()
             selected_docs_ids = selection.index.tolist()

             # Calculate similarity
@@ -4335,8 +4335,8 @@ def _create_topic_vectors(
         if embeddings is not None and documents is not None:
             topic_embeddings = []
             topics = documents.sort_values("Topic").Topic.unique()
-            topic_ids = documents["Topic"].values
-            doc_ids = documents["ID"].values.astype(int)
+            topic_ids = documents["Topic"].to_numpy()
+            doc_ids = documents["ID"].to_numpy().astype(int)
             for topic in topics:
                 mask = topic_ids == topic
                 topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))
@@ -4458,7 +4458,7 @@ def _update_topic_size(self, documents: pd.DataFrame):
         Arguments:
             documents: Updated dataframe with documents and their corresponding IDs and newly added Topics
         """
-        self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
+        self.topics_ = documents.Topic.astype(int).tolist()
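Note (reviewer context, not part of the diff): pandas has recommended `Series.to_numpy()` over the `.values` attribute since 0.24, because `.values` may return either a plain `numpy.ndarray` or a pandas `ExtensionArray` depending on the dtype, whereas `to_numpy()` always returns an ndarray. A minimal sketch of the difference, using a hypothetical nullable-integer series:

    import pandas as pd

    s = pd.Series([1, 2, None], dtype="Int64")  # nullable extension dtype
    print(type(s.values))      # pandas IntegerArray, not a plain ndarray
    print(type(s.to_numpy()))  # numpy.ndarray

The `pd.merge(...)` -> `document_info.merge(...)` change in `get_document_info` is behavior-preserving; the method form simply reads left to right.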