@article{turftopic,
  doi = {10.21105/joss.08183},
  url = {https://doi.org/10.21105/joss.08183},
  year = {2025},
  publisher = {The Open Journal},
  volume = {10},
  number = {111},
  pages = {8183},
  author = {Kardos, Márton and Enevoldsen, Kenneth C. and Kostkan, Jan and Kristensen-McLachlan, Ross Deans and Rocca, Roberta},
  title = {Turftopic: Topic Modelling with Contextual Representations from Sentence Transformers},
  journal = {Journal of Open Source Software}
}

@article{tsne,
  author = {Laurens van der Maaten and Geoffrey Hinton},
  title = {Visualizing Data using t-SNE},
  journal = {Journal of Machine Learning Research},
  year = {2008},
  volume = {9},
  number = {86},
  pages = {2579--2605},
  url = {http://jmlr.org/papers/v9/vandermaaten08a.html}
}

@inproceedings{sklearn_api,
  author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and
            Fabian Pedregosa and Andreas Mueller and Olivier Grisel and
            Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort and
            Jaques Grobler and Robert Layton and Jake VanderPlas and
            Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux},
  title = {{API} design for machine learning software: experiences from the scikit-learn project},
  booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning},
  year = {2013},
  pages = {108--122},
}

@article{using_tsne,
  author = {Wattenberg, Martin and Viégas, Fernanda and Johnson, Ian},
  title = {How to Use t-SNE Effectively},
  journal = {Distill},
  year = {2016},
  url = {http://distill.pub/2016/misread-tsne},
  doi = {10.23915/distill.00002}
}

@misc{understanding_umap,
  author = {Andy Coenen and Adam Pearce},
  title = {Understanding UMAP},
  year = {2019},
  url = {https://pair-code.github.io/understanding-umap/},
}

@article{scott,
  author = {Scott, David W.},
  title = {On optimal and data-based histograms},
  journal = {Biometrika},
  volume = {66},
  number = {3},
  pages = {605--610},
  year = {1979},
  month = {12},
  abstract = {In this paper the formula for the optimal histogram bin width is derived which asymptotically minimizes the integrated mean squared error. Monte Carlo methods are used to verify the usefulness of this formula for small samples. A data-based procedure for choosing the bin width parameter is proposed, which assumes a Gaussian reference standard and requires only the sample size and an estimate of the standard deviation. The sensitivity of the procedure is investigated using several probability models which violate the Gaussian assumption.},
  issn = {0006-3444},
  doi = {10.1093/biomet/66.3.605},
  url = {https://doi.org/10.1093/biomet/66.3.605},
  eprint = {https://academic.oup.com/biomet/article-pdf/66/3/605/632347/66-3-605.pdf},
}

@misc{top2vec,
  title = {Top2Vec: Distributed Representations of Topics},
  author = {Dimo Angelov},
  year = {2020},
  eprint = {2008.09470},
  archivePrefix = {arXiv},
  primaryClass = {cs.CL},
  url = {https://arxiv.org/abs/2008.09470},
}

@article{probabilistic_topic_models,
  author = {Blei, David M.},
  title = {Probabilistic topic models},
  year = {2012},
  issue_date = {April 2012},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {55},
  number = {4},
  issn = {0001-0782},
  url = {https://doi.org/10.1145/2133806.2133826},
  doi = {10.1145/2133806.2133826},
  abstract = {Surveying a suite of algorithms that offer a solution to managing large document archives.},
  journal = {Commun. ACM},
  month = apr,
  pages = {77--84},
  numpages = {8}
}

@inproceedings{hdbscan,
  author = {Campello, Ricardo J. G. B. and Moulavi, Davoud and Sander, Joerg},
  editor = {Pei, Jian and Tseng, Vincent S. and Cao, Longbing and Motoda, Hiroshi and Xu, Guandong},
  title = {Density-Based Clustering Based on Hierarchical Density Estimates},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  year = {2013},
  publisher = {Springer Berlin Heidelberg},
  address = {Berlin, Heidelberg},
  pages = {160--172},
  abstract = {We propose a theoretically and practically improved density-based, hierarchical clustering method, providing a clustering hierarchy from which a simplified tree of significant clusters can be constructed. For obtaining a ``flat'' partition consisting of only the most significant clusters (possibly corresponding to different density thresholds), we propose a novel cluster stability measure, formalize the problem of maximizing the overall stability of selected clusters, and formulate an algorithm that computes an optimal solution to this problem. We demonstrate that our approach outperforms the current, state-of-the-art, density-based clustering methods on a wide variety of real world data.},
  isbn = {978-3-642-37456-2}
}

@misc{embeddinggemma,
  title = {EmbeddingGemma: Powerful and Lightweight Text Representations},
  author = {Schechter Vera, Henrique and Dua, Sahil and Zhang, Biao and Salz, Daniel and Mullins, Ryan and Raghuram Panyam, Sindhu and Smoot, Sara and Naim, Iftekhar and Zou, Joe and Chen, Feiyang and Cer, Daniel and Lisak, Alice and Choi, Min and Gonzalez, Lucas and Sanseviero, Omar and Cameron, Glenn and Ballantyne, Ian and Black, Kat and Chen, Kaifeng and Wang, Weiyi and Li, Zhe and Martins, Gus and Lee, Jinhyuk and Sherwood, Mark and Ji, Juyeong and Wu, Renjie and Zheng, Jingxiao and Singh, Jyotinder and Sharma, Abheesht and Sreepat, Divya and Jain, Aashi and Elarabawy, Adham and Co, AJ and Doumanoglou, Andreas and Samari, Babak and Hora, Ben and Potetz, Brian and Kim, Dahun and Alfonseca, Enrique and Moiseev, Fedor and Han, Feng and Palma Gomez, Frank and Hernández Ábrego, Gustavo and Zhang, Hesen and Hui, Hui and Han, Jay and Gill, Karan and Chen, Ke and Chen, Koert and Shanbhogue, Madhuri and Boratko, Michael and Suganthan, Paul and Duddu, Sai Meher Karthik and Mariserla, Sandeep and Ariafar, Setareh and Zhang, Shanfeng and Zhang, Shijie and Baumgartner, Simon and Goenka, Sonam and Qiu, Steve and Dabral, Tanmaya and Walker, Trevor and Rao, Vikram and Khawaja, Waleed and Zhou, Wenlei and Ren, Xiaoqi and Xia, Ye and Chen, Yichang and Chen, Yi-Ting and Dong, Zhe and Ding, Zhongli and Visin, Francesco and Liu, Gaël and Zhang, Jiageng and Kenealy, Kathleen and Casbon, Michelle and Kumar, Ravin and Mesnard, Thomas and Gleicher, Zach and Brick, Cormac and Lacombe, Olivier and Roberts, Adam and Sung, Yunhsuan and Hoffmann, Raphael and Warkentin, Tris and Joulin, Armand and Duerig, Tom and Seyedhosseini, Mojtaba},
  publisher = {Google DeepMind},
  year = {2025},
  eprint = {2509.20354},
  archivePrefix = {arXiv},
  url = {https://arxiv.org/abs/2509.20354}
}

@inproceedings{sbert,
  title = "Sentence-{BERT}: Sentence Embeddings using {S}iamese {BERT}-Networks",
  author = "Reimers, Nils and Gurevych, Iryna",
  editor = "Inui, Kentaro and Jiang, Jing and Ng, Vincent and Wan, Xiaojun",
  booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
  month = nov,
  year = "2019",
  address = "Hong Kong, China",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/D19-1410/",
  doi = "10.18653/v1/D19-1410",
  pages = "3982--3992",
  abstract = "BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS). However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations ({\textasciitilde}65 hours) with BERT. The construction of BERT makes it unsuitable for semantic similarity search as well as for unsupervised tasks like clustering. In this publication, we present Sentence-BERT (SBERT), a modification of the pretrained BERT network that use siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity. This reduces the effort for finding the most similar pair from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while maintaining the accuracy from BERT. We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning tasks, where it outperforms other state-of-the-art sentence embeddings methods."
}

@misc{bertopic,
  title = {BERTopic: Neural topic modeling with a class-based TF-IDF procedure},
  author = {Maarten Grootendorst},
  year = {2022},
  eprint = {2203.05794},
  archivePrefix = {arXiv},
  primaryClass = {cs.CL},
  url = {https://arxiv.org/abs/2203.05794},
}

@article{mmteb,
  author = {Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff},
  doi = {10.48550/arXiv.2502.13595},
  journal = {arXiv preprint arXiv:2502.13595},
  publisher = {arXiv},
  title = {MMTEB: Massive Multilingual Text Embedding Benchmark},
  url = {https://arxiv.org/abs/2502.13595},
  year = {2025},
}

@inproceedings{glove,
  title = "{G}lo{V}e: Global Vectors for Word Representation",
  author = "Pennington, Jeffrey and Socher, Richard and Manning, Christopher",
  editor = "Moschitti, Alessandro and Pang, Bo and Daelemans, Walter",
  booktitle = "Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})",
  month = oct,
  year = "2014",
  address = "Doha, Qatar",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/D14-1162/",
  doi = "10.3115/v1/D14-1162",
  pages = "1532--1543"
}

@article{fmi,
  author = {E. B. Fowlkes and C. L. Mallows},
  title = {A Method for Comparing Two Hierarchical Clusterings},
  journal = {Journal of the American Statistical Association},
  volume = {78},
  number = {383},
  pages = {553--569},
  year = {1983},
  publisher = {Taylor \& Francis},
  doi = {10.1080/01621459.1983.10478008},
  url = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1983.10478008},
  eprint = {https://www.tandfonline.com/doi/pdf/10.1080/01621459.1983.10478008}
}

@inproceedings{s3,
  title = "$S^3$ - Semantic Signal Separation",
  author = "Kardos, M{\'a}rton and Kostkan, Jan and Enevoldsen, Kenneth and Vermillet, Arnault-Quentin and Nielbo, Kristoffer and Rocca, Roberta",
  editor = "Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher",
  booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
  month = jul,
  year = "2025",
  address = "Vienna, Austria",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2025.acl-long.32/",
  doi = "10.18653/v1/2025.acl-long.32",
  pages = "633--666",
  isbn = "979-8-89176-251-0",
  abstract = "Topic models are useful tools for discovering latent semantic structures in large textual corpora. Recent efforts have been oriented at incorporating contextual representations in topic modeling and have been shown to outperform classical topic models. These approaches are typically slow, volatile, and require heavy preprocessing for optimal results. We present Semantic Signal Separation ($S^3$), a theory-driven topic modeling approach in neural embedding spaces. $S^3$ conceptualizes topics as independent axes of semantic space and uncovers these by decomposing contextualized document embeddings using Independent Component Analysis. Our approach provides diverse and highly coherent topics, requires no preprocessing, and is demonstrated to be the fastest contextual topic model, being, on average, 4.5x faster than the runner-up BERTopic. We offer an implementation of $S^3$, and all contextual baselines, in the Turftopic Python package."
}

@inproceedings{proxann,
  title = "{P}rox{A}nn: Use-Oriented Evaluations of Topic Models and Document Clustering",
  author = "Hoyle, Alexander Miserlis and Calvo-Bartolom{\'e}, Lorena and Boyd-Graber, Jordan Lee and Resnik, Philip",
  editor = "Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher",
  booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
  month = jul,
  year = "2025",
  address = "Vienna, Austria",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2025.acl-long.772/",
  doi = "10.18653/v1/2025.acl-long.772",
  pages = "15872--15897",
  isbn = "979-8-89176-251-0",
  abstract = "Topic models and document-clustering evaluations either use automated metrics that align poorly with human preferences, or require expert labels that are intractable to scale. We design a scalable human evaluation protocol and a corresponding automated approximation that reflect practitioners' real-world usage of models. Annotators{---}or an LLM-based proxy{---}review text items assigned to a topic or cluster, infer a category for the group, then apply that category to other documents. Using this protocol, we collect extensive crowdworker annotations of outputs from a diverse set of topic models on two datasets. We then use these annotations to validate automated proxies, finding that the best LLM proxy is statistically indistinguishable from a human annotator and can therefore serve as a reasonable substitute in automated evaluations."
}

@inproceedings{ctop2vec,
  title = "Topic Modeling: Contextual Token Embeddings Are All You Need",
  author = "Angelov, Dimo and Inkpen, Diana",
  editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung",
  booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
  month = nov,
  year = "2024",
  address = "Miami, Florida, USA",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2024.findings-emnlp.790/",
  doi = "10.18653/v1/2024.findings-emnlp.790",
  pages = "13528--13539",
  abstract = "The goal of topic modeling is to find meaningful topics that capture the information present in a collection of documents. The main challenges of topic modeling are finding the optimal number of topics, labeling the topics, segmenting documents by topic, and evaluating topic model performance. Current neural approaches have tackled some of these problems but none have been able to solve all of them. We introduce a novel topic modeling approach, Contextual-Top2Vec, which uses document contextual token embeddings, it creates hierarchical topics, finds topic spans within documents and labels topics with phrases rather than just words. We propose the use of BERTScore to evaluate topic coherence and to evaluate how informative topics are of the underlying documents. Our model outperforms the current state-of-the-art models on a comprehensive set of topic model evaluation metrics."
}

@inproceedings{doc2vec,
  title = {Distributed Representations of Sentences and Documents},
  author = {Le, Quoc and Mikolov, Tomas},
  booktitle = {Proceedings of the 31st International Conference on Machine Learning},
  pages = {1188--1196},
  year = {2014},
  editor = {Xing, Eric P. and Jebara, Tony},
  volume = {32},
  number = {2},
  series = {Proceedings of Machine Learning Research},
  address = {Beijing, China},
  month = {22--24 Jun},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v32/le14.pdf},
  url = {https://proceedings.mlr.press/v32/le14.html},
  abstract = {Many machine learning algorithms require the input to be represented as a fixed length feature vector. When it comes to texts, one of the most common representations is bag-of-words. Despite their popularity, bag-of-words models have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, "powerful," "strong" and "Paris" are equally distant. In this paper, we propose an unsupervised algorithm that learns vector representations of sentences and text documents. This algorithm represents each document by a dense vector which is trained to predict words in the document. Its construction gives our algorithm the potential to overcome the weaknesses of bag-of-words models. Empirical results show that our technique outperforms bag-of-words models as well as other techniques for text representations. Finally, we achieve new state-of-the-art results on several text classification and sentiment analysis tasks.}
}

@article{umap,
  title = {Uniform manifold approximation and projection},
  volume = {4},
  url = {https://www.nature.com/articles/s43586-024-00363-x},
  doi = {10.1038/s43586-024-00363-x},
  number = {1},
  journal = {Nature Reviews Methods Primers},
  author = {Healy, John and McInnes, Leland},
  year = {2024},
  month = nov
}