
Commit b2ce084

Enable ruff rule PD (#2461)
1 parent 75f2910 commit b2ce084
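
This commit enables ruff's pandas-vet (PD) rule group. Nearly every hunk below fixes PD011, which flags `.values` on pandas objects; pandas itself recommends `.to_numpy()` because the return type of `.values` varies with the dtype. A minimal before/after sketch of the pattern, using a toy DataFrame rather than code from this repository:

import pandas as pd

df = pd.DataFrame({"Topic": [0, 1, 1], "Document": ["a", "b", "c"]})

# Flagged by PD011: the return type of .values depends on the dtype.
docs = df["Document"].values.tolist()

# Preferred fix: .to_numpy() is explicit and always returns an ndarray.
docs = df["Document"].to_numpy().tolist()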

8 files changed (+28 / -27 lines)


bertopic/_bertopic.py

Lines changed: 13 additions & 13 deletions
@@ -457,7 +457,7 @@ def fit_transform(
            logger.info("Embedding - Transforming documents to embeddings.")
            self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
            embeddings = self._extract_embeddings(
-               documents.Document.values.tolist(),
+               documents.Document.to_numpy().tolist(),
                images=images,
                method="document",
                verbose=self.verbose,

@@ -503,7 +503,7 @@ def fit_transform(
        documents = self._sort_mappings_by_frequency(documents)

        # Create documents from images if we have images only
-       if documents.Document.values[0] is None:
+       if documents.Document.to_numpy()[0] is None:
            custom_documents = self._images_to_text(documents, embeddings)

        # Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.

@@ -726,7 +726,7 @@ def partial_fit(
                self.embedding_model, language=self.language, verbose=self.verbose
            )
            embeddings = self._extract_embeddings(
-               documents.Document.values.tolist(),
+               documents.Document.to_numpy().tolist(),
                method="document",
                verbose=self.verbose,
            )

@@ -926,7 +926,7 @@ def topics_over_time(
            # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
            # by simply taking the average of the two
            if global_tuning:
-               selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
+               selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.to_numpy()]
                c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0

            # Extract the words per topic

@@ -1010,11 +1010,11 @@ def topics_per_class(
            # by simply taking the average of the two
            if global_tuning:
                c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)
-               c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0
+               c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.to_numpy() + self._outliers] + c_tf_idf) / 2.0

            # Extract the words per topic
            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
-           topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()
+           topic_frequency = pd.Series(documents_per_topic.Class.to_numpy(), index=documents_per_topic.Topic).to_dict()

            # Fill dataframe with results
            topics_at_class = [

@@ -1796,7 +1796,7 @@ def get_document_info(

        # Add topic info through `.get_topic_info()`
        topic_info = self.get_topic_info().drop("Count", axis=1)
-       document_info = pd.merge(document_info, topic_info, on="Topic", how="left")
+       document_info = document_info.merge(topic_info, on="Topic", how="left")

        # Add top n words
        top_n_words = {topic: " - ".join(next(zip(*self.get_topic(topic)))) for topic in set(self.topics_)}

@@ -1941,7 +1941,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""):
                (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),
                "Distance",
            ]
-           distance = distance.values[0] if len(distance) > 0 else 10
+           distance = distance.to_numpy()[0] if len(distance) > 0 else 10

            if parent != start:
                if grandpa is None:

@@ -4059,7 +4059,7 @@ def _zeroshot_topic_modeling(
        embeddings = embeddings[non_assigned_ids]

        if len(documents) == 0:
-           self.topics_ = assigned_documents["Topic"].values.tolist()
+           self.topics_ = assigned_documents["Topic"].to_numpy().tolist()
            self.topic_mapper_ = TopicMapper(self.topics_)

        logger.info("Zeroshot Step 1 - Completed \u2713")

@@ -4280,7 +4280,7 @@ def _extract_representative_docs(
        for index, topic in enumerate(labels):
            # Slice data
            selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
-           selected_docs = selection["Document"].values
+           selected_docs = selection["Document"].to_numpy()
            selected_docs_ids = selection.index.tolist()

            # Calculate similarity

@@ -4335,8 +4335,8 @@ def _create_topic_vectors(
        if embeddings is not None and documents is not None:
            topic_embeddings = []
            topics = documents.sort_values("Topic").Topic.unique()
-           topic_ids = documents["Topic"].values
-           doc_ids = documents["ID"].values.astype(int)
+           topic_ids = documents["Topic"].to_numpy()
+           doc_ids = documents["ID"].to_numpy().astype(int)
            for topic in topics:
                mask = topic_ids == topic
                topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))

@@ -4458,7 +4458,7 @@ def _update_topic_size(self, documents: pd.DataFrame):
        Arguments:
            documents: Updated dataframe with documents and their corresponding IDs and newly added Topics
        """
-       self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
+       self.topic_sizes_ = collections.Counter(documents.Topic.to_numpy().tolist())
        self.topics_ = documents.Topic.astype(int).tolist()

    def _extract_words_per_topic(
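
The get_document_info hunk above also satisfies PD015, which prefers the DataFrame.merge method over the module-level pd.merge function. The two calls are equivalent; a small sketch with hypothetical frames:

import pandas as pd

document_info = pd.DataFrame({"Topic": [0, 1], "Document": ["a", "b"]})
topic_info = pd.DataFrame({"Topic": [0, 1], "Name": ["0_cats", "1_dogs"]})

# pd.merge(document_info, topic_info, ...) produces the same result;
# PD015 simply prefers the method form.
document_info = document_info.merge(topic_info, on="Topic", how="left")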

bertopic/plotting/_datamap.py

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ def visualize_document_datamap(
            topic_name_mapping[topic_num] = "Unlabelled"

    # Map in topic names and plot
-   named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).values
+   named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).to_numpy()

    if interactive:
        figure = datamapplot.create_interactive_plot(

bertopic/plotting/_hierarchical_documents.py

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ def visualize_hierarchical_documents(
            else:
                trace_name = (
                    f"{topic}_"
-                   + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0]
+                   + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].to_numpy()[0]
                )
                plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
                topic_names[topic] = {

bertopic/plotting/_hierarchy.py

Lines changed: 2 additions & 2 deletions
@@ -306,7 +306,7 @@ def _get_annotations(
        else:
            for key, value in parent_topic.items():
                if set(value) == set(fst_topic):
-                   fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+                   fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]

        if len(scnd_topic) == 1:
            if isinstance(custom_labels, str):

@@ -320,7 +320,7 @@ def _get_annotations(
        else:
            for key, value in parent_topic.items():
                if set(value) == set(scnd_topic):
-                   scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].values[0]
+                   scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0]

        text_annotations.append([fst_name, "", "", scnd_name])

bertopic/plotting/_topics_over_time.py

Lines changed: 3 additions & 3 deletions
@@ -92,10 +92,10 @@ def visualize_topics_over_time(
    fig = go.Figure()
    for index, topic in enumerate(data.Topic.unique()):
        trace_data = data.loc[data.Topic == topic, :]
-       topic_name = trace_data.Name.values[0]
-       words = trace_data.Words.values
+       topic_name = trace_data.Name.to_numpy()[0]
+       words = trace_data.Words.to_numpy()
        if normalize_frequency:
-           y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+           y = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
        else:
            y = trace_data.Frequency
        fig.add_trace(

bertopic/plotting/_topics_per_class.py

Lines changed: 3 additions & 3 deletions
@@ -96,10 +96,10 @@ def visualize_topics_per_class(
        else:
            visible = "legendonly"
        trace_data = data.loc[data.Topic == topic, :]
-       topic_name = trace_data.Name.values[0]
-       words = trace_data.Words.values
+       topic_name = trace_data.Name.to_numpy()[0]
+       words = trace_data.Words.to_numpy()
        if normalize_frequency:
-           x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
+           x = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
        else:
            x = trace_data.Frequency
        fig.add_trace(

bertopic/representation/_visual.py

Lines changed: 4 additions & 4 deletions
@@ -92,7 +92,7 @@ def extract_topics(
        representative_images: Representative images per topic
    """
    # Extract image ids of most representative documents
-   images = documents["Image"].values.tolist()
+   images = documents["Image"].to_numpy().tolist()
    (_, _, _, repr_docs_ids) = topic_model._extract_representative_docs(
        c_tf_idf,
        documents,

@@ -156,10 +156,10 @@ def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> List[str]:
    def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
        """Convert images to text."""
        # Create image topic embeddings
-       topics = documents.Topic.values.tolist()
-       images = documents.Image.values.tolist()
+       topics = documents.Topic.to_numpy().tolist()
+       images = documents.Image.to_numpy().tolist()
        df = pd.DataFrame(np.hstack([np.array(topics).reshape(-1, 1), embeddings]))
-       image_topic_embeddings = df.groupby(0).mean().values
+       image_topic_embeddings = df.groupby(0).mean().to_numpy()

        # Extract image centroids
        image_centroids = {}
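
The second _visual.py hunk applies PD011 to a DataFrame rather than a Series: .to_numpy() also works on the frame returned by a groupby aggregation. A toy sketch with made-up embeddings, not data from the repository:

import numpy as np
import pandas as pd

# Column 0 holds topic ids; the remaining columns are embedding dimensions.
df = pd.DataFrame(np.hstack([np.array([0, 0, 1]).reshape(-1, 1), np.random.rand(3, 4)]))

# Mean embedding per topic, as a plain NumPy matrix.
image_topic_embeddings = df.groupby(0).mean().to_numpy()
print(image_topic_embeddings.shape)  # (2, 4)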

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@ select = [
    "E9",
    "F",  # pyflakes
    "D",  # pydocstyle
+   "PD",  # pandas-vet
    "RUF",  # ruff
]
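
With "PD" added to the select list, ruff check now reports pandas-vet diagnostics such as PD011 and PD015 across the package. The rule is not purely stylistic: for pandas extension dtypes, .values does not return a NumPy array at all, as this sketch (my own illustration, not part of the commit) shows:

import pandas as pd

s = pd.Series([1, 2, None], dtype="Int64")  # nullable integer extension dtype

print(type(s.values))      # pandas IntegerArray, not a NumPy array
print(type(s.to_numpy()))  # numpy.ndarray, always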
