diff --git a/papers/topeax/citations.bib b/papers/topeax/citations.bib
new file mode 100644
index 0000000..6306cbf
--- /dev/null
+++ b/papers/topeax/citations.bib
@@ -0,0 +1,323 @@
+@article{turftopic, doi = {10.21105/joss.08183}, url = {https://doi.org/10.21105/joss.08183}, year = {2025}, publisher = {The Open Journal}, volume = {10}, number = {111}, pages = {8183}, author = {Kardos, Márton and Enevoldsen, Kenneth C. and Kostkan, Jan and Kristensen-McLachlan, Ross Deans and Rocca, Roberta}, title = {Turftopic: Topic Modelling with Contextual Representations from Sentence Transformers}, journal = {Journal of Open Source Software} }
+
+@article{tsne,
+ author = {Laurens van der Maaten and Geoffrey Hinton},
+ title = {Visualizing Data using t-SNE},
+ journal = {Journal of Machine Learning Research},
+ year = {2008},
+ volume = {9},
+ number = {86},
+ pages = {2579--2605},
+ url = {http://jmlr.org/papers/v9/vandermaaten08a.html}
+}
+
+@inproceedings{sklearn_api,
+ author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and
+ Fabian Pedregosa and Andreas Mueller and Olivier Grisel and
+ Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort
+ and Jaques Grobler and Robert Layton and Jake VanderPlas and
+ Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux},
+ title = {{API} design for machine learning software: experiences from the scikit-learn
+ project},
+ booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning},
+ year = {2013},
+ pages = {108--122},
+}
+
+@article{using_tsne,
+ author = {Wattenberg, Martin and Viégas, Fernanda and Johnson, Ian},
+ title = {How to Use t-SNE Effectively},
+ journal = {Distill},
+ year = {2016},
+ url = {http://distill.pub/2016/misread-tsne},
+ doi = {10.23915/distill.00002}
+}
+
+@misc{understanding_umap,
+  author = {Andy Coenen and Adam Pearce},
+  title = {Understanding {UMAP}},
+  year = {2019},
+  url = {https://pair-code.github.io/understanding-umap/},
+}
+
+@article{scott,
+ author = {Scott, David W.},
+ title = {On optimal and data-based histograms},
+ journal = {Biometrika},
+ volume = {66},
+ number = {3},
+ pages = {605--610},
+ year = {1979},
+ month = dec,
+ abstract = {In this paper the formula for the optimal histogram bin width is derived which asymptotically minimizes the integrated mean squared error. Monte Carlo methods are used to verify the usefulness of this formula for small samples. A data-based procedure for choosing the bin width parameter is proposed, which assumes a Gaussian reference standard and requires only the sample size and an estimate of the standard deviation. The sensitivity of the procedure is investigated using several probability models which violate the Gaussian assumption.},
+ issn = {0006-3444},
+ doi = {10.1093/biomet/66.3.605},
+ url = {https://doi.org/10.1093/biomet/66.3.605},
+ eprint = {https://academic.oup.com/biomet/article-pdf/66/3/605/632347/66-3-605.pdf},
+}
+
+@misc{top2vec,
+ title={Top2Vec: Distributed Representations of Topics},
+ author={Dimo Angelov},
+ year={2020},
+ eprint={2008.09470},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2008.09470},
+}
+
+@article{probabilistic_topic_models,
+author = {Blei, David M.},
+title = {Probabilistic topic models},
+year = {2012},
+issue_date = {April 2012},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {55},
+number = {4},
+issn = {0001-0782},
+url = {https://doi.org/10.1145/2133806.2133826},
+doi = {10.1145/2133806.2133826},
+abstract = {Surveying a suite of algorithms that offer a solution to managing large document archives.},
+journal = {Commun. ACM},
+month = apr,
+pages = {77--84},
+numpages = {8}
+}
+
+
+@InProceedings{hdbscan,
+author="Campello, Ricardo J. G. B.
+and Moulavi, Davoud
+and Sander, Joerg",
+editor="Pei, Jian
+and Tseng, Vincent S.
+and Cao, Longbing
+and Motoda, Hiroshi
+and Xu, Guandong",
+title="Density-Based Clustering Based on Hierarchical Density Estimates",
+booktitle="Advances in Knowledge Discovery and Data Mining",
+year="2013",
+publisher="Springer Berlin Heidelberg",
+address="Berlin, Heidelberg",
+pages="160--172",
+abstract="We propose a theoretically and practically improved density-based, hierarchical clustering method, providing a clustering hierarchy from which a simplified tree of significant clusters can be constructed. For obtaining a ``flat'' partition consisting of only the most significant clusters (possibly corresponding to different density thresholds), we propose a novel cluster stability measure, formalize the problem of maximizing the overall stability of selected clusters, and formulate an algorithm that computes an optimal solution to this problem. We demonstrate that our approach outperforms the current, state-of-the-art, density-based clustering methods on a wide variety of real world data.",
+isbn="978-3-642-37456-2"
+}
+
+@misc{embeddinggemma,
+ title={EmbeddingGemma: Powerful and Lightweight Text Representations},
+ author={Schechter Vera, Henrique* and Dua, Sahil* and Zhang, Biao and Salz, Daniel and Mullins, Ryan and Raghuram Panyam, Sindhu and Smoot, Sara and Naim, Iftekhar and Zou, Joe and Chen, Feiyang and Cer, Daniel and Lisak, Alice and Choi, Min and Gonzalez, Lucas and Sanseviero, Omar and Cameron, Glenn and Ballantyne, Ian and Black, Kat and Chen, Kaifeng and Wang, Weiyi and Li, Zhe and Martins, Gus and Lee, Jinhyuk and Sherwood, Mark and Ji, Juyeong and Wu, Renjie and Zheng, Jingxiao and Singh, Jyotinder and Sharma, Abheesht and Sreepat, Divya and Jain, Aashi and Elarabawy, Adham and Co, AJ and Doumanoglou, Andreas and Samari, Babak and Hora, Ben and Potetz, Brian and Kim, Dahun and Alfonseca, Enrique and Moiseev, Fedor and Han, Feng and Palma Gomez, Frank and Hernández Ábrego, Gustavo and Zhang, Hesen and Hui, Hui and Han, Jay and Gill, Karan and Chen, Ke and Chen, Koert and Shanbhogue, Madhuri and Boratko, Michael and Suganthan, Paul and Duddu, Sai Meher Karthik and Mariserla, Sandeep and Ariafar, Setareh and Zhang, Shanfeng and Zhang, Shijie and Baumgartner, Simon and Goenka, Sonam and Qiu, Steve and Dabral, Tanmaya and Walker, Trevor and Rao, Vikram and Khawaja, Waleed and Zhou, Wenlei and Ren, Xiaoqi and Xia, Ye and Chen, Yichang and Chen, Yi-Ting and Dong, Zhe and Ding, Zhongli and Visin, Francesco and Liu, Gaël and Zhang, Jiageng and Kenealy, Kathleen and Casbon, Michelle and Kumar, Ravin and Mesnard, Thomas and Gleicher, Zach and Brick, Cormac and Lacombe, Olivier and Roberts, Adam and Sung, Yunhsuan and Hoffmann, Raphael and Warkentin, Tris and Joulin, Armand and Duerig, Tom and Seyedhosseini, Mojtaba},
+ publisher={Google DeepMind},
+ year={2025},
+ url={https://arxiv.org/abs/2509.20354}
+}
+
+@inproceedings{sbert,
+ title = "Sentence-{BERT}: Sentence Embeddings using {S}iamese {BERT}-Networks",
+ author = "Reimers, Nils and
+ Gurevych, Iryna",
+ editor = "Inui, Kentaro and
+ Jiang, Jing and
+ Ng, Vincent and
+ Wan, Xiaojun",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
+ month = nov,
+ year = "2019",
+ address = "Hong Kong, China",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/D19-1410/",
+ doi = "10.18653/v1/D19-1410",
+ pages = "3982--3992",
+ abstract = "BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS). However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations ({\textasciitilde}65 hours) with BERT. The construction of BERT makes it unsuitable for semantic similarity search as well as for unsupervised tasks like clustering. In this publication, we present Sentence-BERT (SBERT), a modification of the pretrained BERT network that use siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity. This reduces the effort for finding the most similar pair from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while maintaining the accuracy from BERT. We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning tasks, where it outperforms other state-of-the-art sentence embeddings methods."
+}
+
+@misc{bertopic,
+ title={BERTopic: Neural topic modeling with a class-based TF-IDF procedure},
+ author={Maarten Grootendorst},
+ year={2022},
+ eprint={2203.05794},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2203.05794},
+}
+
+@article{mmteb,
+ author = {Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff},
+ doi = {10.48550/arXiv.2502.13595},
+ journal = {arXiv preprint arXiv:2502.13595},
+ publisher = {arXiv},
+ title = {MMTEB: Massive Multilingual Text Embedding Benchmark},
+ url = {https://arxiv.org/abs/2502.13595},
+ year = {2025},
+}
+
+@inproceedings{glove,
+ title = "{G}lo{V}e: Global Vectors for Word Representation",
+ author = "Pennington, Jeffrey and
+ Socher, Richard and
+ Manning, Christopher",
+ editor = "Moschitti, Alessandro and
+ Pang, Bo and
+ Daelemans, Walter",
+ booktitle = "Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})",
+ month = oct,
+ year = "2014",
+ address = "Doha, Qatar",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/D14-1162/",
+ doi = "10.3115/v1/D14-1162",
+ pages = "1532--1543"
+}
+
+@article{fmi,
+author = {E. B. Fowlkes and C. L. Mallows},
+title = {A Method for Comparing Two Hierarchical Clusterings},
+journal = {Journal of the American Statistical Association},
+volume = {78},
+number = {383},
+pages = {553--569},
+year = {1983},
+publisher = {ASA Website},
+doi = {10.1080/01621459.1983.10478008},
+URL = {
+ https://www.tandfonline.com/doi/abs/10.1080/01621459.1983.10478008
+},
+eprint = {
+ https://www.tandfonline.com/doi/pdf/10.1080/01621459.1983.10478008
+}
+}
+
+
+@inproceedings{s3,
+ title = "$S^3$ - Semantic Signal Separation",
+ author = "Kardos, M{\'a}rton and
+ Kostkan, Jan and
+ Enevoldsen, Kenneth and
+ Vermillet, Arnault-Quentin and
+ Nielbo, Kristoffer and
+ Rocca, Roberta",
+ editor = "Che, Wanxiang and
+ Nabende, Joyce and
+ Shutova, Ekaterina and
+ Pilehvar, Mohammad Taher",
+ booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ month = jul,
+ year = "2025",
+ address = "Vienna, Austria",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2025.acl-long.32/",
+ doi = "10.18653/v1/2025.acl-long.32",
+ pages = "633--666",
+ ISBN = "979-8-89176-251-0",
+ abstract = "Topic models are useful tools for discovering latent semantic structures in large textual corpora. Recent efforts have been oriented at incorporating contextual representations in topic modeling and have been shown to outperform classical topic models. These approaches are typically slow, volatile, and require heavy preprocessing for optimal results. We present Semantic Signal Separation ($S^3$), a theory-driven topic modeling approach in neural embedding spaces. $S^3$ conceptualizes topics as independent axes of semantic space and uncovers these by decomposing contextualized document embeddings using Independent Component Analysis. Our approach provides diverse and highly coherent topics, requires no preprocessing, and is demonstrated to be the fastest contextual topic model, being, on average, 4.5x faster than the runner-up BERTopic. We offer an implementation of $S^3$, and all contextual baselines, in the Turftopic Python package."
+}
+
+@inproceedings{proxann,
+ title = "{P}rox{A}nn: Use-Oriented Evaluations of Topic Models and Document Clustering",
+ author = "Hoyle, Alexander Miserlis and
+ Calvo-Bartolom{\'e}, Lorena and
+ Boyd-Graber, Jordan Lee and
+ Resnik, Philip",
+ editor = "Che, Wanxiang and
+ Nabende, Joyce and
+ Shutova, Ekaterina and
+ Pilehvar, Mohammad Taher",
+ booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ month = jul,
+ year = "2025",
+ address = "Vienna, Austria",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2025.acl-long.772/",
+ doi = "10.18653/v1/2025.acl-long.772",
+ pages = "15872--15897",
+ ISBN = "979-8-89176-251-0",
+ abstract = "Topic models and document-clustering evaluations either use automated metrics that align poorly with human preferences, or require expert labels that are intractable to scale. We design a scalable human evaluation protocol and a corresponding automated approximation that reflect practitioners' real-world usage of models. Annotators{---}or an LLM-based proxy{---}review text items assigned to a topic or cluster, infer a category for the group, then apply that category to other documents. Using this protocol, we collect extensive crowdworker annotations of outputs from a diverse set of topic models on two datasets. We then use these annotations to validate automated proxies, finding that the best LLM proxy is statistically indistinguishable from a human annotator and can therefore serve as a reasonable substitute in automated evaluations."
+}
+
+@inproceedings{ctop2vec,
+ title = "Topic Modeling: Contextual Token Embeddings Are All You Need",
+ author = "Angelov, Dimo and
+ Inkpen, Diana",
+ editor = "Al-Onaizan, Yaser and
+ Bansal, Mohit and
+ Chen, Yun-Nung",
+ booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
+ month = nov,
+ year = "2024",
+ address = "Miami, Florida, USA",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2024.findings-emnlp.790/",
+ doi = "10.18653/v1/2024.findings-emnlp.790",
+ pages = "13528--13539",
+ abstract = "The goal of topic modeling is to find meaningful topics that capture the information present in a collection of documents. The main challenges of topic modeling are finding the optimal number of topics, labeling the topics, segmenting documents by topic, and evaluating topic model performance. Current neural approaches have tackled some of these problems but none have been able to solve all of them. We introduce a novel topic modeling approach, Contextual-Top2Vec, which uses document contextual token embeddings, it creates hierarchical topics, finds topic spans within documents and labels topics with phrases rather than just words. We propose the use of BERTScore to evaluate topic coherence and to evaluate how informative topics are of the underlying documents. Our model outperforms the current state-of-the-art models on a comprehensive set of topic model evaluation metrics."
+}
+
+@InProceedings{doc2vec,
+ title = {Distributed Representations of Sentences and Documents},
+ author = {Le, Quoc and Mikolov, Tomas},
+ booktitle = {Proceedings of the 31st International Conference on Machine Learning},
+ pages = {1188--1196},
+ year = {2014},
+ editor = {Xing, Eric P. and Jebara, Tony},
+ volume = {32},
+ number = {2},
+ series = {Proceedings of Machine Learning Research},
+ address = {Bejing, China},
+ month = {22--24 Jun},
+ publisher = {PMLR},
+ pdf = {http://proceedings.mlr.press/v32/le14.pdf},
+ url = {https://proceedings.mlr.press/v32/le14.html},
+ abstract = {Many machine learning algorithms require the input to be represented as a fixed length feature vector. When it comes to texts, one of the most common representations is bag-of-words. Despite their popularity, bag-of-words models have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, "powerful," "strong" and "Paris" are equally distant. In this paper, we propose an unsupervised algorithm that learns vector representations of sentences and text documents. This algorithm represents each document by a dense vector which is trained to predict words in the document. Its construction gives our algorithm the potential to overcome the weaknesses of bag-of-words models. Empirical results show that our technique outperforms bag-of-words models as well as other techniques for text representations. Finally, we achieve new state-of-the-art results on several text classification and sentiment analysis tasks.}
+}
+
+@inproceedings{npmi_colloc,
+  title={Normalized (pointwise) mutual information in collocation extraction},
+  author={Gerlof J. Bouma},
+  booktitle={Proceedings of the Biennial GSCL Conference},
+  year={2009},
+  url={https://api.semanticscholar.org/CorpusID:2762657}
+}
+
+@inproceedings{npmi_coherence,
+author = {R\"{o}der, Michael and Both, Andreas and Hinneburg, Alexander},
+title = {Exploring the Space of Topic Coherence Measures},
+year = {2015},
+isbn = {9781450333177},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/2684822.2685324},
+doi = {10.1145/2684822.2685324},
+abstract = {Quantifying the coherence of a set of statements is a long standing problem with many potential applications that has attracted researchers from different sciences. The special case of measuring coherence of topics has been recently studied to remedy the problem that topic models give no guaranty on the interpretablity of their output. Several benchmark datasets were produced that record human judgements of the interpretability of topics. We are the first to propose a framework that allows to construct existing word based coherence measures as well as new ones by combining elementary components. We conduct a systematic search of the space of coherence measures using all publicly available topic relevance data for the evaluation. Our results show that new combinations of components outperform existing measures with respect to correlation to human ratings. nFinally, we outline how our results can be transferred to further applications in the context of text mining, information retrieval and the world wide web.},
+booktitle = {Proceedings of the Eighth ACM International Conference on Web Search and Data Mining},
+pages = {399--408},
+numpages = {10},
+keywords = {topic coherence, topic evaluation, topic model},
+location = {Shanghai, China},
+series = {WSDM '15}
+}
+
+@inproceedings{tweet_dataset,
+  title={Finding Core Topics: Topic Extraction with Clustering on Tweet},
+  author={Sungchul Kim and Sungho Jeon and Jinha Kim and Young-Ho Park and Hwanjo Yu},
+  booktitle={2012 Second International Conference on Cloud and Green Computing},
+  year={2012},
+  pages={777--782},
+  url={https://api.semanticscholar.org/CorpusID:7986603}
+}
+
+@inproceedings{bbc_news_dataset,
+author = {Greene, Derek and Cunningham, P\'{a}draig},
+title = {Practical solutions to the problem of diagonal dominance in kernel document clustering},
+year = {2006},
+isbn = {1595933832},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/1143844.1143892},
+doi = {10.1145/1143844.1143892},
+abstract = {In supervised kernel methods, it has been observed that the performance of the SVM classifier is poor in cases where the diagonal entries of the Gram matrix are large relative to the off-diagonal entries. This problem, referred to as diagonal dominance, often occurs when certain kernel functions are applied to sparse high-dimensional data, such as text corpora. In this paper we investigate the implications of diagonal dominance for unsupervised kernel methods, specifically in the task of document clustering. We propose a selection of strategies for addressing this issue, and evaluate their effectiveness in producing more accurate and stable clusterings.},
+booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
+pages = {377--384},
+numpages = {8},
+location = {Pittsburgh, Pennsylvania, USA},
+series = {ICML '06}
+}
+@article{umap, title={Uniform manifold approximation and projection}, volume={4}, url={https://www.nature.com/articles/s43586-024-00363-x#citeas}, DOI={10.1038/s43586-024-00363-x}, number={1}, journal={Nature Reviews Methods Primers}, author={Healy, John and McInnes, Leland}, year={2024}, month=nov }
diff --git a/papers/topeax/figures/poster.pdf b/papers/topeax/figures/poster.pdf
new file mode 100644
index 0000000..1a07ddf
Binary files /dev/null and b/papers/topeax/figures/poster.pdf differ
diff --git a/papers/topeax/figures/poster.svg b/papers/topeax/figures/poster.svg
new file mode 100644
index 0000000..fe29ae8
--- /dev/null
+++ b/papers/topeax/figures/poster.svg
@@ -0,0 +1,821 @@
+
+
+
+
diff --git a/papers/topeax/figures/steps_plot.png b/papers/topeax/figures/steps_plot.png
new file mode 100644
index 0000000..4c9125c
Binary files /dev/null and b/papers/topeax/figures/steps_plot.png differ
diff --git a/papers/topeax/main.pdf b/papers/topeax/main.pdf
index 975d253..3adf27e 100644
Binary files a/papers/topeax/main.pdf and b/papers/topeax/main.pdf differ
diff --git a/papers/topeax/main.typ b/papers/topeax/main.typ
index 769aecb..d819171 100644
--- a/papers/topeax/main.typ
+++ b/papers/topeax/main.typ
@@ -15,6 +15,9 @@
radius: 5pt,
extent: 3pt
)
+#set cite(
+ style: "apa",
+)
#let appendix(body) = {
set heading(numbering: "A", supplement: [Appendix])
@@ -31,6 +34,7 @@
#par[
*Márton Kardos* \
Aarhus University \
+ Student no. 202105399\
#link("mailto:martonkardos@cas.au.dk")
]
@@ -53,53 +57,74 @@
= Introduction
+Topic models are statistical models that can identify latent topic variables in a selection of documents, and can describe them with keywords/phrases @probabilistic_topic_models.
+Older topic models typically relied on bag-of-words representations of text, and conceptualized topic discovery as recovering latent factors that generate word content in documents.
+Due to advances in text embedding, documents can now be encoded into dense neural representations @doc2vec @sbert.
+
== Clustering Topic Models
-#figure(
- image("figures/clustering_models.png", width: 100%),
- caption: [Schematic overview of clustering topic models' steps.],
-)
+Neural text embeddings are easier to cluster than bag-of-words document vectors,
+and this has allowed researchers to conceptualize topic modelling as discovering clusters of documents in embedding space.
+
+The Top2Vec model @top2vec relies on a multi-stage pipeline for discovering interpretable topics in embedding spaces. Document embeddings are first reduced to a lower dimensionality using a manifold learning technique called UMAP @umap. Next, documents are clustered using a density-based technique called HDBSCAN @hdbscan, which in theory, can also determine the number of clusters empirically.
+After discovering clusters, Top2Vec assigns importance to words based on their proximity in embedding space to topic vectors, which are centroids of the discovered clusters.
+More recently, #cite(<ctop2vec>, form: "prose") have used a sliding window over BERT embeddings to get clusters of contextualized document chunks, introducing c-Top2Vec. The clustering methodology and term-importance estimation schemes, however, remain the same.
+
+#align(center)[$t_k = frac(sum_(d in T_k) x_d, |T_k|); beta_("kj") = cos(t_k, w_j)$]
+where $t_k$ is the embedding of topic $k$ and $x_d$ is the embedding of document $d$, $T_k$ is the set of documents in topic $k$, $w_j$ is the embedding of term $j$ and $beta_("kj")$ is the importance of term $j$ in topic $k$.
+
+BERTopic @bertopic is a very similar model, with the only difference being that it uses a weighting scheme called c-TF-IDF (see @c_tf_idf for formula) for computing term importance instead. This approach is more theoretically correct, as Top2Vec makes the assumption that clusters are spherical, which is likely not the case with a density-based clustering model.
+
+Users will commonly find that these topic models discover a larger number of topics than they find useful for their analysis. In order to combat this, both methods have a _hierarchical topic reduction_ method. In both cases, users have to specify how many topics they would like to have in the end, and then clusters are merged until this desired number is reached. BERTopic utilizes agglomerative clustering with average linkage, while Top2Vec merges the smallest cluster to the closest one based on centroid proximity.
+Although clustering topic models have been enjoying popularity in academia (at the time of writing, BERTopic has 3726 citations on Google Scholar, while Top2Vec has 827), they are plagued by a number of problems.
+#cite(<proxann>, form: "prose") found, using extensive human evaluation, that BERTopic is no better than older models like LDA. #cite(<s3>, form: "prose") found that BERTopic often includes stop-words in topics, and that Top2Vec is usually negatively affected by higher-dimensional embeddings.
+In addition, these models have usually been evaluated in a setting where the topics were reduced hierarchically, including the original papers.
+Our knowledge is limited on how well these models perform when they have to determine the number of topics themselves, and how well they recover clusters in corpora.
+
+This paper's contributions can be summarized as follows:
+Firstly, I evaluate these models in a free-clustering scenario, without specifying the number of clusters, both on how well they match gold cluster labels, as well as topic quality.
+Secondly, they are evaluated based on their sensitivity to subsampling and hyperparameters.
+And thirdly, I introduce a novel method, Topeax, which outperforms Top2Vec and BERTopic on these tasks.
= Model Specification
I introduce Topeax, a novel topic modelling approach based on document clustering.
-The model differs in a number of aspects from traditional clustering topic models like BERTopic and Top2Vec. The model is implemented in the Turftopic Python package (cite), following scikit-learn API conventions.
-Example usage is presented in @example_code.
+The model is implemented in the Turftopic Python package @turftopic, following scikit-learn API conventions @sklearn_api.
+Example usage, along with figure and keywords are presented in @example_code. This section will outline how Topeax discovers topics.
#figure(
image("figures/peax.png", width: 100%),
caption: [A schematic overview of the Peax clustering algorithm.
- \ Illustrations were generated from the _political ideologies dataset#footnote[https://huggingface.co/datasets/JyotiNayak/political_ideologies]._],
+ \ Illustrations were generated from the _political ideologies dataset#footnote[#link("https://huggingface.co/datasets/JyotiNayak/political_ideologies")]._],
)
== Dimensionality Reduction
Unlike other clustering topic models, Topeax relies on
-t-Distributed Stochastic Neighbour Embeddings (cite it here) instead of UMAP.
-I use the the cosine metric to calculate document similarities for TSNE,
-as it is widely used for model training and downstream applications.
-The number of dimensions was fixed to 2 in all of our experiments,
-as this allows us to visualize the reduced embeddings.
-Additionally, TSNE has fewer hyperparameters than UMAP.
-While it has been demonstrated that TSNE can be sensitive the chosen value of `perplexity`,
+t-Distributed Stochastic Neighbour Embeddings @tsne instead of UMAP.
+Cosine distance is used as the distance metric for dimensionality reduction,
+due to its wide-spread use in model training and downstream applications.
+The number of dimensions was fixed to 2 in all experiments,
+as this allows for easier visualization.
+While it has been demonstrated that TSNE can be sensitive to the chosen value of `perplexity` @using_tsne,
we will show that, within a reasonable range, this will not have an effect on the number of topics
or topic quality.
== The Peax Clustering Model
-While HDBSCAN is the choice of clustering model for both BERTopic and Top2Vec,
-I introduce a new technique for document clustering, termed *#highlight[Peax]*, which,
-instead, clusters documents based on density peaks in the reduced document space.
+For the clustering step in the modelling pipeline,
+I introduce a new technique for clustering, termed *#highlight[Peax]*, which
+clusters documents based on density peaks in the reduced document space.
The Peax algorithm consists of the following steps:
-+ A Gaussian Kernel Density Estimate (KDE) is obtained over the reduced document embeddings.
- Bandwidth is determined with the Scott method.
-+ The KDE is evaluated on a 100x100 grid over the embedding space.
- Density peaks are then detected by applying a local-maximum filter to the KDE heatmap.
++ A Gaussian Kernel Density Estimate (KDE) is obtained over the reduced embeddings.
+ Bandwidth is determined with the Scott method @scott.
++ The KDE is evaluated on a 100x100 grid over the range of the embeddings.
++ Density peaks are detected by applying a local-maximum filter to the KDE heatmap.
A neighbourhood connectivity of 25 is used, which means,
every pixel is included within a 5 unit radius.
+ Cluster centres are assigned to these density peaks.
@@ -110,41 +135,29 @@ The Peax algorithm consists of the following steps:
where $accent(z_d, "^")$ is the estimated underlying component assigned to document $d$,
$accent(x, "^")_d$ is the TSNE embedding of document $d$, and $r_("kd")$ is the responsibility of component $k$ for document $d$.
-#figure(
- placement: top,
- image("figures/bbc_news_light.png", width: 80%),
- caption: [Topeax model illustrated on the BBC News dataset. Topics are identified at density peaks, and keywords get selected based on combined term importance.\
- _Left_: Density plot in 2D with topic names and keywords.
- _Right_: Density landscape in 3D with topic names.
-
-],
-)
-
== Term Importance Estimation
To mitigate the issues experienced with c-TF-IDF and centroid-based term importance estimation in previously proposed clustering topic models,
-I introduce a novel approach that uses a combination of a semantic and a lexical cluster-term importance.
+I introduce a novel approach that uses a combination of an embedding-based and a lexical term importance.
=== Semantic Importance
-Semantic term importance is estimated similar to (cite Top2Vec), but,
+Semantic term importance is estimated similarly to #cite(<top2vec>, form: "prose"), but,
since we have access to a probabilistic, non-spherical model, and cluster boundaries are not hard,
topic vectors are estimated from the responsibility-weighted average of document embeddings. \
-#align(center)[$t_k = frac(sum_(d) r_("kd") dot x_d, sum_(d) r_("kd"))$]
-where $t_k$ is the embedding of topic $k$ and $x_d$ is the embedding of document $d$.
-Let the embedding of term $j$ be $w_j$. The semantic importance of term $j$ for cluster $k$ is then:
-#align(center)[$s_("kj") = cos(t_k, w_j)$]
+#align(center)[$t_k = frac(sum_(d) r_("kd") dot x_d, sum_(d) r_("kd")); s_("kj") = cos(t_k, w_j)$]
+where $t_k$ is the embedding of topic $k$ and $x_d$ is the embedding of document $d$,
+ $w_j$ is the embedding of term $j$ and the semantic importance of term $j$ for cluster $k$ is $s_("kj")$.
=== Lexical Importance
Instead of relying on a tf-idf-based measure for computing the valence of a term in a corpus,
an information-theoretical approach is used.
-Theoretically, we can estimate the lexical importance of a term for a cluster,
-by computing the mutual information of the term's occurrence with the cluster's occurrence.
-Due to its convenient interpretability properties, I opt for using normalized pointwise mutual information (NPMI),
-which has been historically used for phrase detection (cite) and topic-coherence evaluation (cite).
-
-We calculate the pointwise mutual information by taking the logarithm of the fraction of conditional and marginal word probabilities:
+I estimate the lexical importance of a term for a cluster
+by computing the mutual information of the term's presence with the cluster label.
+Due to its easier interpretability, I opt for using normalized pointwise mutual information (NPMI),
+which has been historically used for collocation extraction @npmi_colloc and topic-coherence evaluation @npmi_coherence.
+Pointwise mutual information is calculated by taking the logarithm of the fraction of conditional and marginal word probabilities:
#align(center)[$"pmi"_("kj") = log_2 frac(p(v_j|z_k), p(v_j))$]
where $p(v_j|z_k)$ is the conditional probability of word $j$ given the presence of topic $z_k$,
and $p(v_j)$ is the probability of word $j$ occurring.
@@ -164,67 +177,42 @@ Since regular PMI scores have no lower bound, we normalize them to obtain NPMI:
=== Combined Term Importance
To balance the semantic proximity of keywords to topic embeddings and cluster-term occurrences,
-a I introduce a combined approach, which consists of the geometric mean of min-max normalized lexical and semantic scores:
+a combined approach is used, which consists of the geometric mean of min-max normalized lexical and semantic importance:
#align(center)[$beta_("kj") = sqrt(frac(1 + "npmi"_("kj"), 2) dot frac(1 + s_("kj"), 2))$]
= Experimental Methods
-Since one of the main strengths of clustering approaches, that they can supposedly find the number of clusters in the data, and are not given this information a-priori,
-a good clustering topic model should be able to faithfully replicate a human-assigned clustering of the data, and should be able to describe these clusters in a manner that is human-interpretable. I will therefore utilize datasets with gold-standard labels.
-In this section I will outline the criteria and considerations taken into account when designing an evaluation procedure:
-
-+ The number of clusters in the topic model should preferably be not too far from the number of gold categories.
-+ Preferably, if two points are in the same gold category, they should also belong together in the predicted clustering, while points that do not, shouldn't.
-+ For topic modelling purposes, it is often preferable that the number of clusters is not overly large.
- Topic models should, in theory, aid the understanding of a corpus. Using a topic model becomes impractical when the number of topics one has to interpret is over a couple hundred.
-+ Topics should be distinct and easily readable.
+Since one of the main strengths of clustering approaches is that they can supposedly find the number of clusters in the data, a good clustering topic model should roughly align with a human clustering of the data,
+and should be able to describe these clusters effectively.
+In the following section, I outline the benchmark used to evaluate models on these aspects.
-Reproducible scripts used for evaluation, along with instructions on how to run them, are made available in the `x-tabdeveloping/topeax-eval`#footnote("https://github.com/x-tabdeveloping/topeax-eval") Github repository. Results for all evaluations can be found in the `results/` directory.
+Reproducible scripts used for evaluation, along with instructions on how to run them, are made available in the `x-tabdeveloping/topeax-eval`#footnote(link("https://github.com/x-tabdeveloping/topeax-eval")) Github repository. Results for all evaluations can be found in the `results/` directory.
== Datasets
-In order to evaluate these properties, I used a number of openly available datasets with gold-standard category metadata.
-This included all clustering tasks from the new version of the Massive Text Embedding Benchmark `MTEB(eng, v2)` (cite).
+In order to evaluate models on a variety of domains, I used openly available datasets with gold labels.
+This included all clustering tasks from the new version of the Massive Text Embedding Benchmark `MTEB(eng, v2)` @mmteb.
To avoid evaluating on the same corpus twice, the P2P variants of the tasks were used.
-In addition an annotated Twitter topic-classification dataset, and a BBC News dataset was used.
-
-#figure(
- caption: [Descriptive statistics of the datasets used for evaluation\ _Document length is reported as mean±standard deviation_],
- table(
- columns: 4,
- stroke: none,
- align: (left, center, center, center),
- table.hline(),
- table.header[*Dataset*][*Document Length*\ _N characters_ ][*Corpus Size*\ _N documents_ ][*Clusters* \ _N unique gold labels_],
- table.hline(),
- [ArXivHierarchicalClusteringP2P],[1008.44±438.01],[2048],[23],
- [BiorxivClusteringP2P.v2],[1663.97±541.93],[53787],[26],
- [MedrxivClusteringP2P.v2],[1981.20±922.01],[37500],[51],
- [StackExchangeClusteringP2P.v2],[1091.06±808.88],[74914],[524],
- [TwentyNewsgroupsClustering.v2],[32.04±14.60],[59545],[20],
- [TweetTopicClustering],[165.66±68.19],[4374],[6],
- [BBCNewsClustering],[1000.46±638.41],[2224],[5],
- table.hline(),
- )
-)
+In addition, an annotated Twitter topic-classification dataset @tweet_dataset and a BBC News dataset @bbc_news_dataset were used.
+I report descriptive statistics in @appx_dataset_stats.
== Models
To compare Topeax with existing approaches, it was run on all corpora alongside BERTopic and Top2Vec.
-Implementations were sourced from the Turftopic (cite) Python package.
+Implementations were sourced from the Turftopic @turftopic Python package.
For the main analysis, default hyperparameters were used from the original BERTopic and Top2Vec packages respectively,
-as these give different clusterings, despite having the same pipeline.
-All models were run with both the `all-MiniLM-L6-v2`, the slightly larger and higher performing `all-mpnet-base-v2` sentence encoders (cite sbert), as well as Google's `embeddinggemma-300m`
+as these give different clusterings.
+All models were run with both the `all-MiniLM-L6-v2` and the slightly larger `all-mpnet-base-v2` sentence encoders @sbert, as well as Google's `embeddinggemma-300m` @embeddinggemma
to control for embedding size and quality.
The models were fitted without filtering for stop words and uncommon terms,
-since state-of-the art topic models have been shown to be able to handle such information without issues (cite S3).
+since state-of-the-art topic models have been shown to be able to handle such information without issues @s3.
== Metrics
For evaluating model performance, both clustering quality and topic quality were evaluated.
-I evaluated the faithfulness of the predicted clustering to the gold labels using the Fowlkes-Mallows index (cite).
+I evaluated the faithfulness of the predicted clustering to the gold labels using the Fowlkes-Mallows index @fmi.
The FMI is very similar to the F1 score for classification, in that it also intends to balance precision and recall.
Unlike F1, however, FMI uses the geometric mean of these quantities:
#align(center)[$"FMI" = N_("TP")/sqrt((N_("TP") + N_("FP")) dot (N_("TP") + N_("FN")))$]
@@ -232,8 +220,8 @@ where $N_("TP")$ is the number of pairs of points that get clustered together in
$N_("FP")$ is the number of pairs that get clustered together in the predicted clustering but not in the gold labels (false positives) and
$N_("FN")$ is the number of pairs that do not get clustered together in the predicted clustering, despite them belonging together in the gold labels (false negatives).
-For topic quality, I adopt the methodology of (cite S3), with minor differences.
-I use GloVe embeddings (cite GloVe) for evaluating internal word embedding coherence instead of Skip-gram.
+For topic quality, I adopt the methodology of #cite(<s3>, form: "prose"), with minor differences.
+I use GloVe embeddings @glove for evaluating internal word embedding coherence instead of Skip-gram.
As such, topic quality was evaluated on topic diversity $d$, external word embedding coherence $C_("ex")$ using the `word2vec-google-news-300` word embedding model,
as well as internal word embedding coherence $C_("in")$ with a GloVe model trained on each corpus.
Ideally a model should both have high intrinsic and extrinsic coherence, and thus an aggregate measure of coherence can give a better
@@ -244,15 +232,15 @@ We will also refer to this quantity as _interpretability_.
== Sensitivity to Perplexity
Both TSNE and UMAP have a hyperparameter that determines how many neighbours of a given point are considered when generating lower-dimensional projections; this hyperparameter is usually referred to as _perplexity_.
-It is also known that both methods are sensitive to the choice of hyperparameters, and depending on these, structures, that do not exist in the higher-dimensional feature space might occur (cite Distill article and "Understanding UMAP").
-In order to see how this affects the Topeax algorithm, and how robust it is to the choice of this hyperparameter in comparison with other clustering topic models, I fitted each model to the 20 Newsgroups corpus from `scikit-learn`, using `all-MiniLM-L6-v2` with `perplexities=[2, 5, 30, 50, 100]`.
-This choice of values was inspired by (cite Distill). Each model was evaluated on the metrics outlined above.
+It is also known that both methods are sensitive to the choice of hyperparameters, and depending on these, structures that do not exist in the higher-dimensional feature space might appear in the lower-dimensional representations @using_tsne @understanding_umap.
+In order to see how this affects the Topeax algorithm, in comparison with other clustering topic models, I fit each model to the 20 Newsgroups corpus from `scikit-learn`, using `all-MiniLM-L6-v2` with `perplexities=[2, 5, 30, 50, 100]`.
+This choice of values was inspired by #cite(<using_tsne>, form: "prose"). Each model was evaluated on the metrics outlined above.
== Subsampling Invariance
Ideally, a good topic model should roughly recover the same topics, and same number of topics in a corpus even when we only have access to a subsample of that corpus, assuming that the underlying categories are the same.
-On the other hand, we would reasonably assume that a model having access to the full corpus, instead of a subsample, should increase the accuracy of the results, not decrease it.
-To evaluate models' ability to cope with subsampling, I fit each model on the same corpus and embeddings as in the perplexity sensitivity test, and evaluate them on the previously outlined metrics.
+We should also expect that a model having access to the full corpus, instead of a subsample, should yield higher quality results.
+I fit each model on the same corpus and embeddings as in the perplexity sensitivity test, and evaluate them on the previously outlined metrics.
Subsample sizes are the following: `[250, 1000, 5000, 10_000, "full"]`.
= Results
@@ -260,24 +248,8 @@ Subsample sizes are the following: `[250, 1000, 5000, 10_000, "full"]`.
Topeax substantially outperformed both Top2Vec and BERTopic in cluster recovery, as well as the quality of the topic keywords (see @performance).
A regression analysis predicting Fowlkes-Mallows index from model type, with random effects and intercepts for encoders and datasets was conducted.
The regression was significant at $alpha=0.05$. ($R^2=0.127$, $F=4.368$, $p=0.0169$).
-Both BERTopic and Top2Vec had significantly negative slopes (see @coeffs).
+Both BERTopic and Top2Vec had significantly negative slopes (coefficients and p-values are reported in @appx_regr).
-#figure(
- table(
- columns: 4,
- align: (left, center, center, center),
- stroke: none,
- table.hline(),
- table.header([*Coefficients*], [*Estimate*], [*p-value*], [*95% CI*]),
- table.hline(),
- [Intercept (_Topeax_)], [0.3405], [0.000], [[0.267, 0.414]],
- [Topeax], [-0.1106], [0.038], [[-0.215, -0.006]],
- [BERTopic], [-0.1479], [0.006], [[-0.252, -0.044]],
- table.hline(),
-
- ),
- caption: [Regression coefficients for predicting Fowlkes-Mallows Index from choice of topic model]
-)
Topeax also exhibited the lowest mean absolute percentage error in recovering the number of topics (see @performance) with $"MAPE" = 60.52$ ($"SD"=26.19$),
while Top2Vec ($M=1797.29%, "SD"=2622.52$) and BERTopic ($M = 2438.91%,"SD" = 3011.63$) drastically deviated from the number of gold labels in the datasets.
@@ -315,9 +287,9 @@ caption: [Metrics of topic quality compared between different models. Best bold,
== Perplexity
Metrics of quality and the number of topics across perplexity values are displayed in @perplexity_robustness.
-Topeax converges very early on the number of topics with perplexity, and remains stable from `perplexity=5`, while converges at around `perplexity=30` for quality metrics. It is reasonable to conclude that 50 is a reasonable recommendation and default value.
-Meanwhile, BERTopic converges at around `perplexity=50`, and has the lowest performance on all metrics. Top2Vec does not seem to converge at all for the values of perplexity tested, and is most unstable. It does seem to improve with larger values of the hyperparameter.
-Keep in mind, that while BERTopic and Top2Vec improve with higher values, their default is set at `perplexity=15`, which, in light of these evaluations, seems rather unreasonable.
+Topeax converges very early on the number of topics with perplexity, and remains stable from `perplexity=5`, while it converges at around `perplexity=30` for quality metrics. It is reasonable to conclude that 50 is a good default value.
+Meanwhile, BERTopic converges at around `perplexity=50`, and has the lowest performance on all metrics. Top2Vec does not seem to converge at all in this range, and is most unstable. It does seem to improve with larger values of the hyperparameter.
+Keep in mind, that while BERTopic and Top2Vec improve with higher values, their default is set at `perplexity=15`, which, in light of these evaluations, is suboptimal.
#figure(
@@ -336,7 +308,7 @@ Number of topics, topic quality and cluster quality are displayed on @subsamplin
Topeax is relatively well-behaved, and converges to the highest performance when it has access to the full corpus.
The number of topics is also relatively stable from a sample size of 5000 onwards (hovering around 10-12).
In contrast, BERTopic and Top2Vec do not converge to a single value of N topics and keep growing with the size of the subsample.
-This also has an impact on cluster and topic quality. BERTopic has highest performance on the smallest subsamples (250-1000), while Top2Vec has best performance on a subsample of 5000, both methods decrease in performance as the number of topics grows with sample size. This behaviour is far from ideal, and it is apparent that Topeax is much more reliable at determining the number and structure of clusters in subsampled and full corpora.
+This also has an impact on cluster and topic quality. BERTopic has highest performance on the smallest subsamples (250-1000), while Top2Vec has best performance on a subsample of 5000, both methods decrease in performance as the number of topics grows with sample size. This behaviour is far from ideal, and it is apparent that Topeax is much more reliable at determining the number and structure of clusters.
#figure(
image("figures/robustness_sample_size.png", width: 100%),
@@ -350,8 +322,8 @@ This also has an impact on cluster and topic quality. BERTopic has highest perfo
== Qualitative Considerations
-As per the experimental evaluations presented above, Topeax systematically underestimates the number of clusters in a given dataset, despite matching the gold labels better as per the Fowlkes-Mallows index.
-This warrants further investigation. A Topeax model was run on 20 Newsgroups with `all-MiniLM-L6-v2` embeddings, where the estimated number of clusters was 11, while the original dataset contains data from 20 categories, as suggested by its name.
+As per the experimental evaluations presented above, Topeax systematically underestimates the number of clusters in a given dataset, despite matching the gold labels better.
+This warrants further investigation. A Topeax model was run on 20 Newsgroups with `all-MiniLM-L6-v2` embeddings, where the estimated number of clusters was 11, while the original dataset contains data from 20 categories.
Adjusted mutual information was calculated between each topic discovered by the model and each newsgroup (see @20ng_groups).
#figure(
@@ -364,24 +336,24 @@ Adjusted mutual information was calculated between each topic discovered by the
While the number of clusters is indeed lower than the number of categories in the original dataset, the clustering provided by Topeax is arguably just as natural.
Most clusters ended up compressing information from one or two newsgroups, that were in some way related.
For instance the `1_god_atheism_christians_christianity` topic contained documents from `alt.atheism`, `talk.religion.misc` and `soc.religion.christian`, thereby combining discourse on religion into a single topic. Likewise `6_car_bikes_bmw` compresses the `rec.autos` and `rec.motorcycles` newsgroups.
-In addition, the model uncovered a topic of outlier documents (`7_yer_umm_ahh__i_`), which were either empty, or only contained a few words, no coherent sentences.
+In addition, the model uncovered a topic of outlier documents (`7_yer_umm_ahh__i_`), which were either empty, or contained no coherent sentences.
-Meanwhile, BERTopic discovered 232, and Top2Vec 145 topics in the same corpus using the same embeddings, while labelling 34.15% and 35.07% of documents as outliers respectively.
-While different users and use cases might have different tolerance levels for time spent on analyzing topics, and the number of outliers, this behaviour seems far from ideal under most circumstances.
+BERTopic discovered 232, and Top2Vec 145 topics in the same corpus using the same embeddings, while labelling 34.15% and 35.07% of documents as outliers respectively.
+While users might have different tolerance levels for time spent on analyzing topics, and the number of outliers, this behaviour seems far from ideal under most circumstances.
Interpreting, and labelling the topics would take a considerable amount of time in both cases.
In addition, regarding more than a third of documents as outliers means that a substantial amount of information is not covered by these models.
-This will inevitably prompt users of these topic models to a) hierarchically reduce topics, where they are required to specify the number of topics or b) fiddle with hyperparameters until they arrive at a result they deem sensible.
-It is thus questionable, how much these models are at all able to identify the number of natural clusters in a corpus, and until better and more rigorous heuristics are established for hyperparameter selection, their use remains highly subjective and circular.
+This will inevitably prompt users of these topic models to a) hierarchically reduce topics, where they are required to specify the number of topics or b) change hyperparameters until they arrive at a result they deem sensible.
+It is thus questionable whether these models are at all able to identify the number of natural clusters in a corpus, and until better and more rigorous heuristics are established for hyperparameter selection, their use remains highly subjective.
= Conclusion
I propose a novel method, Topeax for finding natural clusters in text data, and assigning keywords to these clusters
based on peak finding in kernel-density estimates.
-The model is compared to popular clustering topic models, Top2Vec and BERTopic on a number of clustering datasets from the Massive Text Embedding Benchmark.
+The model is compared to popular clustering topic models, Top2Vec and BERTopic on clustering datasets from the Massive Text Embedding Benchmark.
In addition, models' robustness and stability to sample size and hyperparameter choices is evaluated.
-Topeax approximates human clusterings significantly more faithfully than previous approaches and describes topics with more diverse and coherent keywords.
-Furthermore, the model exhibits much more sensible behaviour under changing circumstances and hyperparameters.
-It is found, however, that Topeax underestimates the number of clusters systematically.
+Topeax approximates human clusterings significantly better than previous approaches and describes topics with more diverse and coherent keywords.
+Furthermore, the model exhibits much more stable behaviour under changing sample size and hyperparameters.
+It is important to note, however, that Topeax underestimates the number of clusters systematically.
Qualitative investigation suggests that this is due to the model grouping together related clusters in the case of 20 Newsgroups.
In light of these findings, Topeax seems a better choice for text clustering,
@@ -398,39 +370,47 @@ In addition the evaluation methodology also has a number of limitations of its o
+ Quantitative metrics of topic quality, while roughly correlating with human preference, do not perfectly capture interpretability. Preferably, future research should evaluate topic quality with human subjects.
+ Subsampling and perplexity were only tested on the 20NG corpus in the interest of time and compute. This is of course a limitation, and evaluation on multiple corpora would be preferable.
+#pagebreak()
+
+#bibliography("citations.bib", style: "apa", title: "References")
+
#pagebreak()
#heading(level:1, numbering: none, "Appendix")
+
#show: appendix
= Example code
Due to the model being implemented in Turftopic,
-it is very easy to run on a new corpus. One first has to install the package:
-
-```bash
-pip install turftopic
-```
-
-Then run fit the model to a corpus, here's an example with 20 Newsgroups:
+you can easily run it on a corpus and print and plot the fitted model's results:
```python
-from sklearn.datasets import fetch_20newsgroups
+# pip install turftopic datasets plotly
+from datasets import load_dataset
from turftopic import Topeax
-ds = fetch_20newsgroups(
- subset="all",
- remove=("headers", "footers", "quotes"),
-)
-corpus = ds.data
+ds = load_dataset("gopalkalpande/bbc-news-summary", split="train")
+topeax = Topeax(random_state=42)
+doc_topic = topeax.fit_transform(list(ds["Summaries"]))
+
+topeax.plot_steps()
+```
+(see @steps_plot)
+
+#figure(
+ image("figures/steps_plot.png", width: 100%),
+ caption: [Interactive plot of steps in the Topeax algorithm on the BBC News dataset],
+) <steps_plot>
-model = Topeax()
-model.fit(corpus)
-model.print_topics()
+
+```python
+topeax.print_topics()
```
+(see @bbc_keywords)
#figure(
- caption: [Topics found in the 20 Newsgroups corpus],
+ caption: [Top 10 Keywords for the topics found in the BBC News corpus],
table(
columns: 2,
stroke: none,
@@ -438,18 +418,72 @@ model.print_topics()
table.hline(),
table.header([ *ID* ], [*Highest Ranking*]),
table.hline(),
- [ 0 ], [armenians, armenian, israel, israeli, jews, genocide, turkish, palestinians, palestinian, israelis ],
- [ 1 ], [god, christians, atheism, christianity, bible, scripture, christian, theology, faith, church ],
- [ 2 ], [ pitching, pitcher, hitter, baseball, braves, batting, pitchers, cubs, sox, fielder ],
- [ 3] ,[ hockey, nhl, puck, leafs, sabres, bruins, flyers, islanders, team, canucks ],
- [ 4],[ gun, guns, militia, amendment, firearms, homicides, nra, fbi, crime, homicide],
- [ 5],[ patients, disease, medical, treatment, doctor, clinical, vitamin, medicine, treatments, infection ],
- [ 6],[ car, bike, cars, bmw, honda, engine, motorcycle, ford, dealer, bikes ],
- [ 7], [yer, umm, ahhh, \_i\_, \_you\_, cheek, expresses, reacted, ths, advertisement ],
- [ 8], [ ax, nasa, spacecraft, a86, satellite, detectors, satellites, spaceflight, max, langley ],
- [ 9],[ encryption, nsa, key, privacy, security, clipper, chip, encrypted, crypto, cryptography ],
- [ 10], [motherboard, scsi, card, ram, mhz, chipset, bios, hardware, monitor, modem ],
- [ 11],[ windows, xfree86, x11r5, x11, openwindows, jpeg, window, xterm, x11r4, microsoft ],
+ [0],[mobile, microsoft, digital, technology, broadband, phones, devices, internet, mobiles, computer],
+ [1],[economy, growth, economic, deficit, prices, gdp, inflation, currency, rates, exports],
+ [2],[profits, shareholders, shares, takeover, shareholder, company, profit, merger, investors, financial],
+ [3],[film, actor, oscar, films, actress, oscars, bafta, movie, awards, actors],
+ [4],[band, album, song, singer, concert, rock, songs, rapper, rap, grammy],
+ [5],[tory, blair, labour, ukip, mps, minister, election, tories, mr, ministers],
+ [6],[olympic, tennis, iaaf, federer, wimbledon, doping, roddick, champion, athletics, olympics],
+ [7],[rugby, liverpool, england, mourinho, chelsea, premiership, arsenal, gerrard, hodgson, gareth],
table.hline(),
),
-)
+) <bbc_keywords>
+
+= C-TF-IDF
+
+This section contains the formula for computing C-TF-IDF term importance.
+
+- Let $C_("ij")$ be the number of times word j occurs in document i.
+- $"tf"_("kj") = frac(c_("kj"),w_k)$, where
+ $c_("kj") = sum_(i in k) C_("ij")$ is the number of occurrences of a word in a topic and
+ $w_(k)= sum_j c_("kj")$ is all words in the topic
+- Estimate inverse document/topic frequency for term $j$:
+ $"idf"_j = log(1 + frac(A,sum_k |c_("kj")|))$, where
+ $A = frac(sum_k sum_j c_("kj"),M)$ is the average number of words per topic, and $M$ is the number of topics.
+- Calculate importance of term $j$ for topic $k$: $beta_("kj") = "tf"_("kj") dot "idf"_j$
+
+= Descriptive Statistics for Datasets <appx_dataset_stats>
+
+Testing dataset statistics are reported in @dataset_stats.
+
+#figure(
+ caption: [Descriptive statistics of the datasets used for evaluation\ _Document length is reported as mean±standard deviation_],
+ table(
+ columns: 4,
+ stroke: none,
+ align: (left, center, center, center),
+ table.hline(),
+ table.header[*Dataset*][*Document Length*\ _N characters_ ][*Corpus Size*\ _N documents_ ][*Clusters* \ _N unique gold labels_],
+ table.hline(),
+ [ArXivHierarchicalClusteringP2P],[1008.44±438.01],[2048],[23],
+ [BiorxivClusteringP2P.v2],[1663.97±541.93],[53787],[26],
+ [MedrxivClusteringP2P.v2],[1981.20±922.01],[37500],[51],
+ [StackExchangeClusteringP2P.v2],[1091.06±808.88],[74914],[524],
+ [TwentyNewsgroupsClustering.v2],[32.04±14.60],[59545],[20],
+ [TweetTopicClustering],[165.66±68.19],[4374],[6],
+ [BBCNewsClustering],[1000.46±638.41],[2224],[5],
+ table.hline(),
+ )
+) <dataset_stats>
+
+= Regression modelling <appx_regr>
+
+Coefficients for the model predicting FMI from model type are reported in @coeffs.
+
+#figure(
+ table(
+ columns: 4,
+ align: (left, center, center, center),
+ stroke: none,
+ table.hline(),
+ table.header([*Coefficients*], [*Estimate*], [*p-value*], [*95% CI*]),
+ table.hline(),
+ [Intercept (_Topeax_)], [0.3405], [0.000], [[0.267, 0.414]],
+ [Top2Vec], [-0.1106], [0.038], [[-0.215, -0.006]],
+ [BERTopic], [-0.1479], [0.006], [[-0.252, -0.044]],
+ table.hline(),
+
+ ),
+ caption: [Regression coefficients for predicting Fowlkes-Mallows Index from choice of topic model]
+) <coeffs>