abstract = {In this volume, Matthew L. Jockers introduces readers to large-scale literary computing and the revolutionary potential of macroanalysis--a new approach to the study of the literary record designed for probing the digital-textual world as it exists today, in digital form and in large quantities. Using computational analysis to retrieve key words, phrases, and linguistic patterns across thousands of texts in digital libraries, researchers can draw conclusions based on quantifiable evidence regarding how literary trends are employed over time, across periods, within regions, or within demographic groups, as well as how cultural, historical, and societal linkages may bind individual authors, texts, and genres into an aggregate literary culture. Moving beyond the limitations of literary interpretation based on the close-reading of individual works, Jockers describes how this new method of studying large collections of digital material can help us to better understand and contextualize the individual works within those collections.},
author = {Jockers, Matthew L.},
publisher = {University of Illinois Press},
title = {Macroanalysis: Digital Methods and Literary History},
urldate = {2025-05-27},
year = {2013}
}
@article{hotel_sector,
  title   = {Analysing online customer experience in hotel sector using dynamic topic modelling and net promoter score},
  volume  = {14},
  url     = {https://www.emerald.com/insight/content/doi/10.1108/jhtt-04-2021-0116/full/html},
  doi     = {10.1108/jhtt-04-2021-0116},
  number  = {2},
  journal = {Journal of Hospitality and Tourism Technology},
  author  = {Nguyen, Van-Ho and Ho, Thanh},
  year    = {2023},
  month   = feb,
  pages   = {258--277}
}
@article{social_media_mining,
  title   = {Mining social media data via supervised topic model: Can social media posts inform customer satisfaction?},
  url     = {https://onlinelibrary.wiley.com/doi/full/10.1111/deci.12660},
  DOI     = {10.1111/deci.12660},
  journal = {Decision Sciences},
  author  = {Huang, Yinghui and Li, Mei and Tsung, Fugee and Chang, Xiangyu},
  year    = {2025},
  month   = jan
}
@inproceedings{content_recommendation,
  author    = {Bergamaschi, Sonia and Po, Laura},
  editor    = {Monfort, Val{\'e}rie and Krempels, Karl-Heinz},
  title     = {Comparing {LDA} and {LSA} Topic Models for Content-Based Movie Recommendation Systems},
  booktitle = {Web Information Systems and Technologies},
  year      = {2015},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {247--263},
  abstract  = {We propose a plot-based recommendation system, which is based upon an evaluation of similarity between the plot of a video that was watched by a user and a large amount of plots stored in a movie database. Our system is independent from the number of user ratings, thus it is able to propose famous and beloved movies as well as old or unheard movies/programs that are still strongly related to the content of the video the user has watched. The system implements and compares the two Topic Models, Latent Semantic Analysis (LSA) and Latent Dirichlet Allocation (LDA), on a movie database of two hundred thousand plots that has been constructed by integrating different movie databases in a local NoSQL (MongoDB) DBMS. The topic models behaviour has been examined on the basis of standard metrics and user evaluations, performance assessments with 30 users to compare our tool with a commercial system have been conducted.},
  isbn      = {978-3-319-27030-2}
}
@article{unsupervised_classification,
  author    = {Thielmann, Anton and Weisser, Christoph and Krenz, Astrid and S{\"a}fken, Benjamin},
  title     = {Unsupervised document classification integrating web scraping, one-class {SVM} and {LDA} topic modelling},
  journal   = {Journal of Applied Statistics},
  volume    = {50},
  number    = {3},
  pages     = {574--591},
  year      = {2023},
  publisher = {Taylor \& Francis},
  doi       = {10.1080/02664763.2021.1919063},
  note      = {PMID: 36819086},
}
@inproceedings{information_retrieval,
  author    = {Yi, Xing and Allan, James},
  editor    = {Boughanem, Mohand and Berrut, Catherine and Mothe, Josiane and Soule-Dupuy, Chantal},
  title     = {A Comparative Study of Utilizing Topic Models for Information Retrieval},
  booktitle = {Advances in Information Retrieval},
  year      = {2009},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  pages     = {29--41},
  abstract  = {We explore the utility of different types of topic models for retrieval purposes. Based on prior work, we describe several ways that topic models can be integrated into the retrieval process. We evaluate the effectiveness of different types of topic models within those retrieval approaches. We show that: (1) topic models are effective for document smoothing; (2) more rigorous topic models such as Latent Dirichlet Allocation provide gains over cluster-based models; (3) more elaborate topic models that capture topic dependencies provide no additional gains; (4) smoothing documents by using their similar documents is as effective as smoothing them by using topic models; (5) doing query expansion should utilize topics discovered in the top feedback documents instead of coarse-grained topics from the whole corpus; (6) generally, incorporating topics in the feedback documents for building relevance models can benefit the performance more for queries that have more relevant documents.},
  isbn      = {978-3-642-00958-7}
}
@misc{data_mixers,
  title  = {Unsupervised Topic Models are Data Mixers for Pre-training Language Models},
  author = {Jiahui Peng and Xinlin Zhuang and Qiu Jiantao and Ren Ma and Jing Yu and Tianyi Bai and Conghui He},
@@ -68,19 +68,38 @@ The user has full control over what components should be used at different stage
68
68
69
69
The library comes loaded with numerous utilities to help users interpret their results, including *pretty printing* utilities for exploring topics, *interactive visualizations* partially powered by the `topicwizard`[@topicwizard] Python package, and *automated topic naming* with LLMs.
70
70
71
-
To accommodate a variety of use cases, Turftopic can be used for *dynamic* topic modelling, where we expect topics to change over time.
71
+
To accommodate a variety of use cases, Turftopic can be used for *dynamic* topic modelling, where topics are expected to change over time.
72
72
Turftopic is also capable of extracting topics at multiple levels of granularity, thereby uncovering *hierarchical* topic structures.
73
73
Some models can also be fitted in an *online* fashion, where documents are accounted for as they come in batches.
74
74
Turftopic also includes *seeded* topic modelling, where a seed phrase can be used to retrieve topics relevant to the specific research question.
75
75
76
-
# Use Cases
76
+
# Use cases
77
77
78
-
Topic modelling is a key tool for quantitative text analysis [@quantitative_text_analysis], and can be utilized in a number of research settings, including exploratory data analysis, discourse analysis of diverse domains, such as newspapers, social media or policy documents.
79
-
Turftopic has already been utilized by @keynmf for analyzing information dynamics in Chinese diaspora media, and is currently being used in multiple ongoing research projects, including one analyzing discourse on the HPV vaccine in Denmark, and studying Danish golden-age literature.
78
+
Topic models can be and have been utilized for numerous purposes in both academia and industry.
79
+
They are a key tool in digital/computational humanities, mainly as an instrument of quantitative text analysis or *distant reading* [@quantitative_text_analysis],
80
+
as topic models can pick up on macro-patterns in corpora, at times missed by close readers [@macroanalysis],
81
+
and might be able to provide a more impartial account of a corpus's content.
82
+
Topic models can also aid discourse analysis by facilitating exploratory data analysis, and quantitative modelling of information dynamics [@discourse_analysis].
83
+
Industry analysts might make use of topic models for analyzing customer feedback [@hotel_sector] or social media data related to a company's products [@social_media_mining].
84
+
85
+
Since topic models learn topically informative representations of text, they can also be utilized for down-stream applications,
86
+
such as content filtering, recommendation [@content_recommendation],
87
+
unsupervised classification [@unsupervised_classification], information retrieval [@information_retrieval] and pre-training data curation [@data_mixers].
88
+
89
+
The Turftopic framework has already been utilized by @keynmf for analyzing information dynamics in Chinese diaspora media, and is currently being used in multiple ongoing research projects,
90
+
including one concerning the media coverage of the HPV vaccine in Denmark,
91
+
and another studying Danish golden-age literature.
92
+
We provide examples of correct usage and case studies as part of our documentation.
80
93
81
94
# Target Audience
82
95
83
-
We expect that Turftopic will prove useful to a diverse user base including computational researchers in digital humanities and social sciences, and industry NLP professionals.
96
+
Turftopic's utility has already been demonstrated for computational scholars in digital humanities and political science,
97
+
and we expect that it will be of utility to a diverse audience of researchers in social sciences, medicine, linguistics and legal studies.
98
+
It can furthermore prove valuable to business analysts working with text-based data to generate qualitative insights.
99
+
100
+
As the focus on pre-training data mixing techniques is on the rise, we expect that Turftopic will help facilitate foundational language model research.
101
+
The library's design, wide array of models, and flexibility are also aimed at enabling usage in more extended production pipelines for retrieval, filtering or content recommendation, and we thus expect the package to be a most valuable tool for the industry NLP practitioner.
102
+
84
103
Turftopic is also an appropriate choice for educational purposes, providing instructors with a single, user-friendly framework for students to explore and compare alternative topic modelling approaches.