
Commit b2ce084

Enable ruff rule PD (#2461)
1 parent 75f2910 commit b2ce084
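
This commit enables ruff's pandas-vet (PD) rule group. Nearly every hunk below fixes PD011, which flags `.values` on pandas objects; pandas itself recommends `.to_numpy()` because the return type of `.values` varies with the dtype. A minimal before/after sketch of the pattern, using a toy DataFrame rather than code from this repository:

import pandas as pd

df = pd.DataFrame({"Topic": [0, 1, 1], "Document": ["a", "b", "c"]})

# Flagged by PD011: the return type of .values depends on the dtype.
docs = df["Document"].values.tolist()

# Preferred fix: .to_numpy() is explicit and always returns an ndarray.
docs = df["Document"].to_numpy().tolist()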

8 files changed (+28 / -27 lines)


bertopic/_bertopic.py

Lines changed: 13 additions & 13 deletions
@@ -457,7 +457,7 @@ def fit_transform(
            logger.info("Embedding - Transforming documents to embeddings.")
            self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
            embeddings = self._extract_embeddings(
-               documents.Document.values.tolist(),
+               documents.Document.to_numpy().tolist(),
                images=images,
                method="document",
                verbose=self.verbose,

@@ -503,7 +503,7 @@ def fit_transform(
        documents = self._sort_mappings_by_frequency(documents)

        # Create documents from images if we have images only
-       if documents.Document.values[0] is None:
+       if documents.Document.to_numpy()[0] is None:
            custom_documents = self._images_to_text(documents, embeddings)

        # Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.

@@ -726,7 +726,7 @@ def partial_fit(
                self.embedding_model, language=self.language, verbose=self.verbose
            )
            embeddings = self._extract_embeddings(
-               documents.Document.values.tolist(),
+               documents.Document.to_numpy().tolist(),
                method="document",
                verbose=self.verbose,
            )

@@ -926,7 +926,7 @@ def topics_over_time(
            # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
            # by simply taking the average of the two
            if global_tuning:
-               selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
+               selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.to_numpy()]
                c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0

            # Extract the words per topic

@@ -1010,11 +1010,11 @@ def topics_per_class(
            # by simply taking the average of the two
            if global_tuning:
                c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)
-               c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0
+               c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.to_numpy() + self._outliers] + c_tf_idf) / 2.0

            # Extract the words per topic
            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
-           topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()
+           topic_frequency = pd.Series(documents_per_topic.Class.to_numpy(), index=documents_per_topic.Topic).to_dict()

            # Fill dataframe with results
            topics_at_class = [

@@ -1796,7 +1796,7 @@ def get_document_info(

        # Add topic info through `.get_topic_info()`
        topic_info = self.get_topic_info().drop("Count", axis=1)
-       document_info = pd.merge(document_info, topic_info, on="Topic", how="left")
+       document_info = document_info.merge(topic_info, on="Topic", how="left")

        # Add top n words
        top_n_words = {topic: " - ".join(next(zip(*self.get_topic(topic)))) for topic in set(self.topics_)}

@@ -1941,7 +1941,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
                (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),
                "Distance",
            ]
-           distance = distance.values[0] if len(distance) > 0 else 10
+           distance = distance.to_numpy()[0] if len(distance) > 0 else 10

            if parent != start:
                if grandpa is None:

@@ -4059,7 +4059,7 @@ def _zeroshot_topic_modeling(
        embeddings = embeddings[non_assigned_ids]

        if len(documents) == 0:
-           self.topics_ = assigned_documents["Topic"].values.tolist()
+           self.topics_ = assigned_documents["Topic"].to_numpy().tolist()
            self.topic_mapper_ = TopicMapper(self.topics_)

        logger.info("Zeroshot Step 1 - Completed \u2713")

@@ -4280,7 +4280,7 @@ def _extract_representative_docs(
        for index, topic in enumerate(labels):
            # Slice data
            selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
-           selected_docs = selection["Document"].values
+           selected_docs = selection["Document"].to_numpy()
            selected_docs_ids = selection.index.tolist()

            # Calculate similarity

@@ -4335,8 +4335,8 @@ def _create_topic_vectors(
        if embeddings is not None and documents is not None:
            topic_embeddings = []
            topics = documents.sort_values("Topic").Topic.unique()
-           topic_ids = documents["Topic"].values
-           doc_ids = documents["ID"].values.astype(int)
+           topic_ids = documents["Topic"].to_numpy()
+           doc_ids = documents["ID"].to_numpy().astype(int)
            for topic in topics:
                mask = topic_ids == topic
                topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))

@@ -4458,7 +4458,7 @@ def _update_topic_size(self, documents: pd.DataFrame):
        Arguments:
            documents: Updated dataframe with documents and their corresponding IDs and newly added Topics
        """
-       self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
+       self.topic_sizes_ = collections.Counter(documents.Topic.to_numpy().tolist())
        self.topics_ = documents.Topic.astype(int).tolist()

    def _extract_words_per_topic(
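
The get_document_info hunk above also satisfies PD015, which prefers the DataFrame.merge method over the module-level pd.merge function. The two calls are equivalent; a small sketch with hypothetical frames:

import pandas as pd

document_info = pd.DataFrame({"Topic": [0, 1], "Document": ["a", "b"]})
topic_info = pd.DataFrame({"Topic": [0, 1], "Name": ["0_cats", "1_dogs"]})

# pd.merge(document_info, topic_info, ...) produces the same result;
# PD015 simply prefers the method form.
document_info = document_info.merge(topic_info, on="Topic", how="left")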

bertopic/plotting/_datamap.py

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ def visualize_document_datamap(
            topic_name_mapping[topic_num] = "Unlabelled"

    # Map in topic names and plot
-   named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).values
+   named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).to_numpy()

    if interactive:
        figure = datamapplot.create_interactive_plot(

bertopic/plotting/_hierarchical_documents.py

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ def visualize_hierarchical_documents(
            else:
                trace_name = (
                    f"{topic}_"
-                   + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0]
+                   + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].to_numpy()[0]
                )
                plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
                topic_names[topic] = {

bertopic/plotting/_hierarchy.py

Lines changed: 2 additions & 2 deletions
@@ -306,7 +306,7 @@ def _get_annotations(
        else:
            for key, value in parent_topic.items():
                if set(value) == set(fst_topic):
-                   fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+                   fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]

        if len(scnd_topic) == 1:
            if isinstance(custom_labels, str):

@@ -320,7 +320,7 @@ def _get_annotations(
        else:
            for key, value in parent_topic.items():
                if set(value) == set(scnd_topic):
-                   scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+                   scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]

        text_annotations.append([fst_name, "", "", scnd_name])

bertopic/plotting/_topics_over_time.py

Lines changed: 3 additions & 3 deletions
@@ -92,10 +92,10 @@ def visualize_topics_over_time(
    fig = go.Figure()
    for index, topic in enumerate(data.Topic.unique()):
        trace_data = data.loc[data.Topic == topic, :]
-       topic_name = trace_data.Name.values[0]
-       words = trace_data.Words.values
+       topic_name = trace_data.Name.to_numpy()[0]
+       words = trace_data.Words.to_numpy()
        if normalize_frequency:
-           y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+           y = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
        else:
            y = trace_data.Frequency
        fig.add_trace(

bertopic/plotting/_topics_per_class.py

Lines changed: 3 additions & 3 deletions
@@ -96,10 +96,10 @@ def visualize_topics_per_class(
        else:
            visible = "legendonly"
        trace_data = data.loc[data.Topic == topic, :]
-       topic_name = trace_data.Name.values[0]
-       words = trace_data.Words.values
+       topic_name = trace_data.Name.to_numpy()[0]
+       words = trace_data.Words.to_numpy()
        if normalize_frequency:
-           x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+           x = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
        else:
            x = trace_data.Frequency
        fig.add_trace(

bertopic/representation/_visual.py

Lines changed: 4 additions & 4 deletions
@@ -92,7 +92,7 @@ def extract_topics(
        representative_images: Representative images per topic
    """
    # Extract image ids of most representative documents
-   images = documents["Image"].values.tolist()
+   images = documents["Image"].to_numpy().tolist()
    (_, _, _, repr_docs_ids) = topic_model._extract_representative_docs(
        c_tf_idf,
        documents,

@@ -156,10 +156,10 @@ def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> List[str]:
    def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
        """Convert images to text."""
        # Create image topic embeddings
-       topics = documents.Topic.values.tolist()
-       images = documents.Image.values.tolist()
+       topics = documents.Topic.to_numpy().tolist()
+       images = documents.Image.to_numpy().tolist()
        df = pd.DataFrame(np.hstack([np.array(topics).reshape(-1, 1), embeddings]))
-       image_topic_embeddings = df.groupby(0).mean().values
+       image_topic_embeddings = df.groupby(0).mean().to_numpy()

        # Extract image centroids
        image_centroids = {}
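
The second _visual.py hunk applies PD011 to a DataFrame rather than a Series: .to_numpy() also works on the frame returned by a groupby aggregation. A toy sketch with made-up embeddings, not data from the repository:

import numpy as np
import pandas as pd

# Column 0 holds topic ids; the remaining columns are embedding dimensions.
df = pd.DataFrame(np.hstack([np.array([0, 0, 1]).reshape(-1, 1), np.random.rand(3, 4)]))

# Mean embedding per topic, as a plain NumPy matrix.
image_topic_embeddings = df.groupby(0).mean().to_numpy()
print(image_topic_embeddings.shape)  # (2, 4)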

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@ select = [
    "E9",
    "F",  # pyflakes
    "D",  # pydocstyle
+   "PD",  # pandas-vet
    "RUF",  # ruff
]
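
With "PD" added to the select list, ruff check now reports pandas-vet diagnostics such as PD011 and PD015 across the package. The rule is not purely stylistic: for pandas extension dtypes, .values does not return a NumPy array at all, as this sketch (my own illustration, not part of the commit) shows:

import pandas as pd

s = pd.Series([1, 2, None], dtype="Int64")  # nullable integer extension dtype

print(type(s.values))      # pandas IntegerArray, not a NumPy array
print(type(s.to_numpy()))  # numpy.ndarray, always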
