Skip to content

Commit d5d7c1b

Browse files
Added paper draft
1 parent a639ebf commit d5d7c1b

File tree

3 files changed

+287
-0
lines changed

3 files changed

+287
-0
lines changed

paper.bib

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
@misc{s3,
  title         = {{$S^3$} -- Semantic Signal Separation},
  author        = {Kardos, Márton and Kostkan, Jan and Vermillet, Arnault-Quentin and Nielbo, Kristoffer and Enevoldsen, Kenneth and Rocca, Roberta},
  year          = {2024},
  eprint        = {2406.09556},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.09556},
}
10+
11+
@misc{keynmf,
  title         = {Context is {Key(NMF)}: Modelling Topical Information Dynamics in {Chinese} Diaspora Media},
  author        = {Kristensen-McLachlan, Ross Deans and Hicke, Rebecca M. M. and Kardos, Márton and Thunø, Mette},
  year          = {2024},
  eprint        = {2410.12791},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2410.12791},
}
20+
21+
@misc{bertopic_paper,
  title         = {{BERTopic}: Neural topic modeling with a class-based {TF-IDF} procedure},
  author        = {Grootendorst, Maarten},
  year          = {2022},
  eprint        = {2203.05794},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
29+
30+
@inproceedings{topmost,
  title     = {Towards the {TopMost}: A Topic Modeling System Toolkit},
  author    = {Wu, Xiaobao and Pan, Fengjun and Luu, Anh Tuan},
  editor    = {Cao, Yixin and Feng, Yang and Xiong, Deyi},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-demos.4/},
  doi       = {10.18653/v1/2024.acl-demos.4},
  pages     = {31--41},
  abstract  = {Topic models have a rich history with various applications and have recently been reinvigorated by neural topic modeling. However, these numerous topic models adopt totally distinct datasets, implementations, and evaluations. This impedes quick utilization and fair comparisons, and thereby hinders their research progress and applications. To tackle this challenge, we in this paper propose a Topic Modeling System Toolkit (TopMost). Compared to existing toolkits, TopMost stands out by supporting more extensive features. It covers a broader spectrum of topic modeling scenarios with their complete lifecycles, including datasets, preprocessing, models, training, and evaluations. Thanks to its highly cohesive and decoupled modular design, TopMost enables rapid utilization, fair comparisons, and flexible extensions of diverse cutting-edge topic models. Our code, tutorials, and documentation are available at https://github.com/bobxwu/topmost.},
}
48+
49+
@inproceedings{stream,
  title     = {{STREAM}: Simplified Topic Retrieval, Exploration, and Analysis Module},
  author    = {Thielmann, Anton and Reuter, Arik and Weisser, Christoph and Kant, Gillian and Kumar, Manish and S{\"a}fken, Benjamin},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-short.41/},
  doi       = {10.18653/v1/2024.acl-short.41},
  pages     = {435--444},
  abstract  = {Topic modeling is a widely used technique to analyze large document corpora. With the ever-growing emergence of scientific contributions in the field, non-technical users may often use the simplest available software module, independent of whether there are potentially better models available. We present a Simplified Topic Retrieval, Exploration, and Analysis Module (STREAM) for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. For better topic analysis, we implement multiple intruder-word based topic evaluation metrics. Additionally, we publicize multiple new datasets that can extend the so far very limited number of publicly available benchmark datasets in topic modeling. We integrate downstream interpretable analysis modules to enable users to easily analyse the created topics in downstream tasks together with additional tabular information. The code is available at the following link: https://github.com/AnFreTh/STREAM},
}
70+
71+
@inproceedings{ctm,
  title     = {Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence},
  author    = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk},
  editor    = {Zong, Chengqing and Xia, Fei and Li, Wenjie and Navigli, Roberto},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = aug,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.acl-short.96},
  doi       = {10.18653/v1/2021.acl-short.96},
  pages     = {759--766},
  abstract  = {Topic models extract groups of words from documents, whose interpretation as a topic hopefully allows for a better understanding of the data. However, the resulting word groups are often not coherent, making them harder to interpret. Recently, neural topic models have shown improvements in overall coherence. Concurrently, contextual embeddings have advanced the state of the art of neural models in general. In this paper, we combine contextualized representations with neural topic models. We find that our approach produces more meaningful and coherent topics than traditional bag-of-words topic models and recent neural models. Our results indicate that future improvements in language models will translate into better topic models.},
}
90+
91+
@inproceedings{zeroshot_tm,
  title     = {Cross-lingual Contextualized Topic Models with Zero-shot Learning},
  author    = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk and Nozza, Debora and Fersini, Elisabetta},
  editor    = {Merlo, Paola and Tiedemann, Jorg and Tsarfaty, Reut},
  booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume},
  month     = apr,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.eacl-main.143},
  doi       = {10.18653/v1/2021.eacl-main.143},
  pages     = {1676--1683},
  abstract  = {Many data sets (e.g., reviews, forums, news, etc.) exist parallelly in multiple languages. They all cover the same content, but the linguistic differences make it impossible to use traditional, bag-of-word-based topic models. Models have to be either single-language or suffer from a huge, but extremely sparse vocabulary. Both issues can be addressed by transfer learning. In this paper, we introduce a zero-shot cross-lingual topic model. Our model learns topics on one language (here, English), and predicts them for unseen documents in different languages (here, Italian, French, German, and Portuguese). We evaluate the quality of the topic predictions for the same document in different languages. Our results show that the transferred topics are coherent and stable across languages, which suggests exciting future research directions.},
}
111+
112+
@article{blei_prob_topic_models,
  title   = {Probabilistic topic models},
  author  = {Blei, David M.},
  journal = {Communications of the ACM},
  volume  = {55},
  number  = {4},
  pages   = {77--84},
  year    = {2012},
  month   = apr,
  doi     = {10.1145/2133806.2133826},
}
113+
114+
@misc{top2vec,
  title         = {{Top2Vec}: Distributed Representations of Topics},
  author        = {Angelov, Dimo},
  year          = {2020},
  eprint        = {2008.09470},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
122+
123+
@inproceedings{prodlda,
  title     = {Autoencoding Variational Inference For Topic Models},
  author    = {Srivastava, Akash and Sutton, Charles},
  booktitle = {International Conference on Learning Representations},
  year      = {2017},
  url       = {https://api.semanticscholar.org/CorpusID:29842525},
}
130+
131+
@article{scikit-learn,
  title   = {Scikit-learn: Machine Learning in {Python}},
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011},
}
142+
143+
@inproceedings{blei_dynamic,
  author    = {Blei, David M. and Lafferty, John D.},
  title     = {Dynamic Topic Models},
  booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
  series    = {ICML '06},
  year      = {2006},
  isbn      = {1595933832},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Pittsburgh, Pennsylvania, USA},
  pages     = {113--120},
  numpages  = {8},
  url       = {https://doi.org/10.1145/1143844.1143859},
  doi       = {10.1145/1143844.1143859},
  abstract  = {A family of probabilistic time series models is developed to analyze the time evolution of topics in large document collections. The approach is to use state space models on the natural parameters of the multinomial distributions that represent the topics. Variational approximations based on Kalman filters and nonparametric wavelet regression are developed to carry out approximate posterior inference over the latent topics. In addition to giving quantitative, predictive models of a sequential corpus, dynamic topic models provide a qualitative window into the contents of a large document collection. The models are demonstrated by analyzing the OCR'ed archives of the journal Science from 1880 through 2000.},
}
159+
160+
@inproceedings{blei_hierarchical,
  author    = {Blei, David M. and Jordan, Michael I. and Griffiths, Thomas L. and Tenenbaum, Joshua B.},
  title     = {Hierarchical Topic Models and the Nested {Chinese} Restaurant Process},
  booktitle = {Proceedings of the 16th International Conference on Neural Information Processing Systems},
  series    = {NIPS'03},
  year      = {2003},
  publisher = {MIT Press},
  address   = {Cambridge, MA, USA},
  location  = {Whistler, British Columbia, Canada},
  pages     = {17--24},
  numpages  = {8},
  abstract  = {We address the problem of learning topic hierarchies from data. The model selection problem in this domain is daunting—which of the large collection of possible trees to use? We take a Bayesian approach, generating an appropriate prior via a distribution on partitions that we refer to as the nested Chinese restaurant process. This nonparametric prior allows arbitrarily large branching factors and readily accommodates growing data collections. We build a hierarchical topic model by combining this prior with a likelihood that is based on a hierarchical variant of latent Dirichlet allocation. We illustrate our approach on simulated data and with an application to the modeling of NIPS abstracts.},
}
173+
174+
@misc{ctm_docs,
  author = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk},
  title  = {Contextualized Topic Models 2.5.0 Documentation},
  url    = {https://contextualized-topic-models.readthedocs.io/en/latest/introduction.html},
  year   = {2020},
}
177+
178+
@misc{fastopic,
  title         = {{FASTopic}: A Fast, Adaptive, Stable, and Transferable Topic Modeling Paradigm},
  author        = {Wu, Xiaobao and Nguyen, Thong and Zhang, Delvin Ce and Wang, William Yang and Luu, Anh Tuan},
  year          = {2024},
  eprint        = {2405.17978},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2405.17978},
}
187+
188+
@article{sentence_transformers,
  author     = {Reimers, Nils and Gurevych, Iryna},
  title      = {{Sentence-BERT}: Sentence Embeddings using Siamese {BERT}-Networks},
  journal    = {CoRR},
  volume     = {abs/1908.10084},
  year       = {2019},
  url        = {http://arxiv.org/abs/1908.10084},
  eprinttype = {arXiv},
  eprint     = {1908.10084},
  timestamp  = {Thu, 26 Nov 2020 12:13:54 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1908-10084.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
202+
203+
@software{topicwizard,
  author  = {Kardos, Márton},
  title   = {{topicwizard}: Pretty and opinionated topic model visualization in {Python}},
  url     = {https://github.com/x-tabdeveloping/topic-wizard},
  version = {0.5.0},
  month   = nov,
  year    = {2023},
}

paper.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
---
2+
title: 'Turftopic: Topic Modelling with Contextual Embeddings'
3+
tags:
4+
- Python
5+
- topic modelling
6+
- sentence-transformers
7+
- embeddings
8+
authors:
9+
- name: Márton Kardos
10+
orcid: 0000-0001-9652-4498
11+
affiliation: 1
12+
- name: Kenneth C. Enevoldsen
13+
orcid: 0000-0001-8733-0966
14+
affiliation: 1
15+
- name: Jan Kostkan
16+
orcid: 0000-0002-9707-7121
17+
affiliation: 1
18+
- name: Ross Deans Kristensen-McLachlan
19+
orcid: 0000-0001-8714-1911
20+
affiliation: "1, 3"
21+
- name: Roberta Rocca
22+
orcid: 0000-0002-0680-7097
23+
affiliation: 2
24+
affiliations:
25+
- name: Center for Humanities Computing, Aarhus University, Denmark
26+
index: 1
27+
- name: Interactive Minds Center, Aarhus University, Denmark
28+
index: 2
29+
- name: Department of Linguistics, Cognitive Science, and Semiotics, Aarhus University, Denmark
30+
index: 3
31+
date: 17 March 2025
32+
bibliography: paper.bib
33+
---
34+
35+
# Summary
36+
37+
Turftopic is a topic modelling library including a number of recent topic models that go beyond bag-of-words and can understand text in context, utilizing representations from transformers.
38+
The library focuses on ease of use, providing a unified interface for a number of different modern topic models, and offering both model-specific and model-agnostic interpretation and visualization utilities.
39+
The user is afforded great flexibility in model choice and customization, but the library comes with reasonable defaults, so as not to overwhelm first-time users with a plethora of choices.
40+
In addition, our library is capable of modeling topics over time, modeling topics in streams of texts, finding hierarchical themes, and multilingual usage.
41+
For ease of interpretation, users can utilize large language models to assign human-readable names to topics, and can model key phrases, lemmas or stems right out of the box.
42+
43+
![An Overview of Turftopic's Functionality](assets/paper_banner.png)
44+
45+
# Statement of need
46+
47+
While a number of software packages have been developed for contextual topic modelling in recent years, including BERTopic [@bertopic_paper], Top2Vec [@top2vec], and CTM [@ctm], these packages include implementations of only one or two topic models, and most of the utilities they provide are model-specific. This has resulted in the unfortunate situation that practitioners need to switch between different topic modelling libraries if they intend to try different models for their use case, and adapt to their particularities in both interface and functionality.
48+
Some attempts have been made at creating unified packages for modern topic models, including STREAM [@stream] and TopMost [@topmost]. These packages, however, focus on neural models and topic model evaluation, have abstract and highly specialized interfaces, and do not include all popular topic models.
49+
Additionally, we consider the interpretation utilities included in these libraries to be fairly limited.
50+
51+
Turftopic unifies state-of-the-art contextual topic models under a superset of the scikit-learn [@scikit-learn] API, which users are likely already familiar with, and can be readily included in scikit-learn workflows and pipelines.
52+
We focused on making Turftopic first and foremost an easy-to-use library, that does not necessitate expert knowledge or excessive amounts of code to get started with, but gives great flexibility to power users.
53+
The library also includes three topic models that, to our knowledge, are only implemented in Turftopic: KeyNMF [@keynmf], S^3^ [@s3], and GMM.
54+
55+
# Functionality
56+
57+
Turftopic includes a wide array of contextual topic models from the literature, including:
58+
FASTopic [@fastopic]; clustering models, such as BERTopic [@bertopic_paper] and Top2Vec [@top2vec]; auto-encoding topic models, like CombinedTM [@ctm] and ZeroShotTM [@zeroshot_tm]; KeyNMF [@keynmf]; Semantic Signal Separation [@s3]; and GMM.
59+
We believe these models to be representative of the state of the art in contextual topic modelling and intend to expand on them in the future.
60+
61+
![Components of a Topic Modelling Pipeline in Turftopic](https://x-tabdeveloping.github.io/turftopic/images/topic_modeling_pipeline.png){width="800px"}
62+
63+
Each model in Turftopic has an *encoder* component, which is used for producing continuous document-representations, and a *vectorizer* component, which extracts term counts in each document, thereby dictating which terms will be considered in topics.
64+
The user has full control over what components should be used at different stages of the topic modelling process, thereby having fine-grained influence on the nature and quality of topics.
65+
66+
The library comes loaded with numerous utilities to help users interpret their results, including *pretty printing* utilities for exploring topics, *interactive visualizations* partially powered by the `topicwizard` [@topicwizard] Python package, and *automated topic naming* with LLMs.
67+
68+
To accommodate a variety of use cases, Turftopic can be used for dynamic topic modelling, where topics are expected to change over time, as well as for uncovering hierarchical structure in topics.
69+
Some models can also be fitted in an *online* fashion, where documents are accounted for as they come in by batches.
70+
Turftopic also includes *seeded* topic modelling, where a seed phrase can be used to retrieve topics relevant to the specific research question.
71+
72+
# Target Audience
73+
74+
Topic models can be utilized in a number of research settings, including exploratory data analysis and discourse analysis in many domains, such as newspapers, social media, or policy documents.
75+
Turftopic has already been utilized by @keynmf for analyzing information dynamics in Chinese Diaspora Media, and is currently being used in multiple ongoing research projects, including one analyzing discourse on the HPV vaccine in Denmark.
76+
In addition, the library has already seen extensive use in student exam papers at Aarhus University.
77+
We expect that computational researchers, especially in digital humanities and in social sciences will benefit from an easy-to-use, unified contextual topic modelling package.

paper.pdf

906 KB
Binary file not shown.

0 commit comments

Comments
 (0)