91 changes: 91 additions & 0 deletions turftopic/feature_importance.py
@@ -1,6 +1,11 @@
from __future__ import annotations

from typing import Literal

import numpy as np
import scipy.sparse as spr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import scale


def cluster_centroid_distance(
@@ -126,3 +131,89 @@ def bayes_rule(
p_tw = (p_wt.T * p_t).T / p_w
p_tw /= np.nansum(p_tw, axis=0)
return p_tw


def fighting_words(
doc_topic_matrix: np.ndarray,
doc_term_matrix: spr.csr_matrix,
prior: float | Literal["corpus"] = "corpus",
) -> np.ndarray:
"""Computes feature importance using the *Fighting Words* algorithm.

Parameters
----------
doc_topic_matrix: np.ndarray
Document-topic matrix of shape (n_documents, n_topics)
doc_term_matrix: spr.csr_matrix
Document-term matrix of shape (n_documents, vocab_size)
prior: float or "corpus", default "corpus"
Dirichlet prior to use. When a float, it indicates the alpha
parameter of a symmetric Dirichlet; if "corpus",
word frequencies from the background corpus are used.

Returns
-------
ndarray of shape (n_topics, vocab_size)
Term importance matrix.
"""
labels = np.argmax(doc_topic_matrix, axis=1)
n_topics = doc_topic_matrix.shape[1]
n_vocab = doc_term_matrix.shape[1]
components = []
if prior == "corpus":
priors = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
else:
priors = np.full(n_vocab, prior)
a0 = np.sum(priors)  # equals prior * n_vocab when a symmetric prior is used
for i_topic in range(n_topics):
topic_freq = np.ravel(
np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
)
rest_freq = np.ravel(
np.asarray(doc_term_matrix[labels != i_topic].sum(axis=0))
)
n1 = np.sum(topic_freq)
n2 = np.sum(rest_freq)
topic_logodds = np.log(
(topic_freq + priors) / (n1 + a0 - topic_freq - priors)
)
rest_logodds = np.log(
(rest_freq + priors) / (n2 + a0 - rest_freq - priors)
)
delta = topic_logodds - rest_logodds
delta_var = 1 / (topic_freq + priors) + 1 / (rest_freq + priors)
zscore = delta / np.sqrt(delta_var)
components.append(zscore)
return np.stack(components)
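For reference, the z-scores computed here are the weighted log-odds-ratios with an informative Dirichlet prior from Monroe et al.'s Fightin' Words method. A minimal sketch of calling the function directly on toy data (the matrices below are made up for illustration; in normal use the clustering model builds them for you):

import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import fighting_words

# Toy example: 4 documents, 2 topics, 3 vocabulary terms.
doc_topic_matrix = np.array(
    [[0.9, 0.1], [0.8, 0.2], [0.2, 0.8], [0.1, 0.9]]
)
doc_term_matrix = spr.csr_matrix(
    [[5, 0, 1], [4, 1, 0], [0, 6, 1], [1, 5, 2]]
)
# Each row holds the term z-scores for one topic; large positive values
# mark terms that are characteristic of that topic.
importance = fighting_words(doc_topic_matrix, doc_term_matrix)
print(importance.shape)  # (2, 3)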


def semantic_difference(
doc_topic_matrix: np.ndarray,
embeddings: np.ndarray,
vocab_embeddings: np.ndarray,
) -> np.ndarray:
"""Computes feature importances based on semantic differences
between one group and the rest.

Parameters
----------
doc_topic_matrix: np.ndarray
Document-topic matrix of shape (n_documents, n_topics)
embeddings: np.ndarray
Document embeddings of shape (n_documents, embedding_size).
vocab_embeddings: np.ndarray
Term embeddings of shape (vocab_size, embedding_size)

Returns
-------
ndarray of shape (n_topics, vocab_size)
Term importance matrix.
"""
labels = np.argmax(doc_topic_matrix, axis=1)
unique_labels = np.sort(np.unique(labels))
components = []
for label in unique_labels:
mean_diff = np.mean(embeddings[label == labels], axis=0) - np.mean(
embeddings[label != labels], axis=0
)
components.append(np.dot(vocab_embeddings, mean_diff))
return scale(np.stack(components), axis=1)
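For reference, semantic_difference scores each term by the dot product of its embedding with the difference between a topic's mean document embedding and that of all other documents, standardized per topic. A minimal sketch on random arrays of the documented shapes (the data below is made up for illustration):

import numpy as np

from turftopic.feature_importance import semantic_difference

rng = np.random.default_rng(42)
n_docs, n_topics, vocab_size, dim = 20, 3, 50, 16
doc_topic_matrix = rng.random((n_docs, n_topics))
embeddings = rng.normal(size=(n_docs, dim))            # document embeddings
vocab_embeddings = rng.normal(size=(vocab_size, dim))  # term embeddings
importance = semantic_difference(doc_topic_matrix, embeddings, vocab_embeddings)
# One row of standardized term scores per topic that received documents.
print(importance.shape)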
24 changes: 17 additions & 7 deletions turftopic/models/cluster.py
@@ -20,7 +20,7 @@
from turftopic.dynamic import DynamicTopicModel
from turftopic.feature_importance import (bayes_rule,
cluster_centroid_distance, ctf_idf,
soft_ctf_idf)
fighting_words, soft_ctf_idf)
from turftopic.vectorizer import default_vectorizer

integer_message = """
@@ -39,7 +39,7 @@
"""

feature_message = """
feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid'
feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'
"""

NOT_MATCHING_ERROR = (
@@ -152,14 +152,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
Clustering method to use for finding topics.
Defaults to OPTICS with 25 minimum cluster size.
To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'fighting-words', 'centroid'}, default 'soft-c-tf-idf'
Method for estimating term importances.
'centroid' uses distances from cluster centroid similarly
to Top2Vec.
'c-tf-idf' uses BERTopic's c-tf-idf.
'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
be very similar to 'c-tf-idf'.
'bayes' uses Bayes' rule.
'fighting-words' uses the Fighting Words algorithm (Bayesian weighted log-odds with a Dirichlet prior).
n_reduce_to: int, default None
Number of topics to reduce topics to.
The specified reduction method will be used to merge them.
@@ -188,6 +188,7 @@ def __init__(
"soft-c-tf-idf",
"centroid",
"bayes",
"fighting-words",
] = "soft-c-tf-idf",
n_reduce_to: Optional[int] = None,
reduction_method: Literal[
@@ -202,6 +203,7 @@
"soft-c-tf-idf",
"centroid",
"bayes",
"fighting-words",
]:
raise ValueError(feature_message)
if isinstance(encoder, int):
@@ -364,21 +366,21 @@ def reset_topics(self):
def estimate_components(
self,
feature_importance: Literal[
"centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
"centroid", "soft-c-tf-idf", "bayes", "c-tf-idf", "fighting-words"
],
) -> np.ndarray:
"""Estimates feature importances based on a fitted clustering.

Parameters
----------
feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'}, default 'soft-c-tf-idf'
Method for estimating term importances.
'centroid' uses distances from cluster centroid similarly
to Top2Vec.
'c-tf-idf' uses BERTopic's c-tf-idf.
'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
be very similar to 'c-tf-idf'.
'bayes' uses Bayes' rule.
'fighting-words' uses the Fighting Words algorithm (Bayesian weighted log-odds with a Dirichlet prior).

Returns
-------
@@ -426,6 +428,10 @@ def estimate_components(
self.components_ = bayes_rule(
document_topic_matrix, self.doc_term_matrix
)
elif feature_importance == "fighting-words":
self.components_ = fighting_words(
document_topic_matrix, self.doc_term_matrix
)
else:
self.components_ = ctf_idf(
document_topic_matrix, self.doc_term_matrix
@@ -556,6 +562,10 @@ def estimate_temporal_components(
self.temporal_components_[i_timebin] = bayes_rule(
t_doc_topic, t_dtm
)
elif feature_importance == "fighting-words":
self.temporal_components_[i_timebin] = fighting_words(
t_doc_topic, t_dtm
)
elif feature_importance == "centroid":
t_topic_vectors = self._calculate_topic_vectors(
time_labels == i_timebin,
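As a usage note, the new option plugs into ClusteringTopicModel the same way as the existing ones. A minimal sketch, assuming the standard fit(raw_documents) interface of turftopic models, using 20 Newsgroups purely as an example dataset, and leaving all other constructor arguments at their defaults:

from sklearn.datasets import fetch_20newsgroups

from turftopic.models.cluster import ClusteringTopicModel

# Any corpus of raw document strings works; 20 Newsgroups is used here
# only as an illustrative, publicly available example.
corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data

# Use Fighting Words term importances from the start...
model = ClusteringTopicModel(feature_importance="fighting-words").fit(corpus)

# ...or re-estimate term importances on an already fitted model
# without re-running the clustering.
model.estimate_components("fighting-words")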