@misc{s3,
  title = {$S^3$ -- Semantic Signal Separation},
  author = {Márton Kardos and Jan Kostkan and Arnault-Quentin Vermillet and Kristoffer Nielbo and Kenneth Enevoldsen and Roberta Rocca},
  year = {2024},
  eprint = {2406.09556},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  url = {https://arxiv.org/abs/2406.09556}
}

@misc{keynmf,
  title = {Context is Key(NMF): Modelling Topical Information Dynamics in Chinese Diaspora Media},
  author = {Ross Deans Kristensen-McLachlan and Rebecca M. M. Hicke and Márton Kardos and Mette Thunø},
  year = {2024},
  eprint = {2410.12791},
  archivePrefix = {arXiv},
  primaryClass = {cs.CL},
  url = {https://arxiv.org/abs/2410.12791}
}

@misc{bertopic_paper,
  title = {BERTopic: Neural topic modeling with a class-based TF-IDF procedure},
  author = {Maarten Grootendorst},
  year = {2022},
  eprint = {2203.05794},
  archivePrefix = {arXiv},
  primaryClass = {cs.CL},
  url = {https://arxiv.org/abs/2203.05794}
}

@inproceedings{topmost,
  title = {Towards the {T}op{M}ost: A Topic Modeling System Toolkit},
  author = {Wu, Xiaobao and Pan, Fengjun and Luu, Anh Tuan},
  editor = {Cao, Yixin and Feng, Yang and Xiong, Deyi},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  month = aug,
  year = {2024},
  address = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.acl-demos.4/},
  doi = {10.18653/v1/2024.acl-demos.4},
  pages = {31--41},
  abstract = {Topic models have a rich history with various applications and have recently been reinvigorated by neural topic modeling. However, these numerous topic models adopt totally distinct datasets, implementations, and evaluations. This impedes quick utilization and fair comparisons, and thereby hinders their research progress and applications. To tackle this challenge, we in this paper propose a Topic Modeling System Toolkit (TopMost). Compared to existing toolkits, TopMost stands out by supporting more extensive features. It covers a broader spectrum of topic modeling scenarios with their complete lifecycles, including datasets, preprocessing, models, training, and evaluations. Thanks to its highly cohesive and decoupled modular design, TopMost enables rapid utilization, fair comparisons, and flexible extensions of diverse cutting-edge topic models. Our code, tutorials, and documentation are available at https://github.com/bobxwu/topmost.}
}

@inproceedings{stream,
  title = {{STREAM}: Simplified Topic Retrieval, Exploration, and Analysis Module},
  author = {Thielmann, Anton and Reuter, Arik and Weisser, Christoph and Kant, Gillian and Kumar, Manish and S{\"a}fken, Benjamin},
  editor = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month = aug,
  year = {2024},
  address = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.acl-short.41/},
  doi = {10.18653/v1/2024.acl-short.41},
  pages = {435--444},
  abstract = {Topic modeling is a widely used technique to analyze large document corpora. With the ever-growing emergence of scientific contributions in the field, non-technical users may often use the simplest available software module, independent of whether there are potentially better models available. We present a Simplified Topic Retrieval, Exploration, and Analysis Module (STREAM) for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. For better topic analysis, we implement multiple intruder-word based topic evaluation metrics. Additionally, we publicize multiple new datasets that can extend the so far very limited number of publicly available benchmark datasets in topic modeling. We integrate downstream interpretable analysis modules to enable users to easily analyse the created topics in downstream tasks together with additional tabular information. The code is available at the following link: https://github.com/AnFreTh/STREAM}
}

@inproceedings{ctm,
  title = {Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence},
  author = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk},
  editor = {Zong, Chengqing and Xia, Fei and Li, Wenjie and Navigli, Roberto},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month = aug,
  year = {2021},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.acl-short.96},
  doi = {10.18653/v1/2021.acl-short.96},
  pages = {759--766},
  abstract = {Topic models extract groups of words from documents, whose interpretation as a topic hopefully allows for a better understanding of the data. However, the resulting word groups are often not coherent, making them harder to interpret. Recently, neural topic models have shown improvements in overall coherence. Concurrently, contextual embeddings have advanced the state of the art of neural models in general. In this paper, we combine contextualized representations with neural topic models. We find that our approach produces more meaningful and coherent topics than traditional bag-of-words topic models and recent neural models. Our results indicate that future improvements in language models will translate into better topic models.}
}

@inproceedings{zeroshot_tm,
  title = {Cross-lingual Contextualized Topic Models with Zero-shot Learning},
  author = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk and Nozza, Debora and Fersini, Elisabetta},
  editor = {Merlo, Paola and Tiedemann, Jorg and Tsarfaty, Reut},
  booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume},
  month = apr,
  year = {2021},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2021.eacl-main.143},
  doi = {10.18653/v1/2021.eacl-main.143},
  pages = {1676--1683},
  abstract = {Many data sets (e.g., reviews, forums, news, etc.) exist parallelly in multiple languages. They all cover the same content, but the linguistic differences make it impossible to use traditional, bag-of-word-based topic models. Models have to be either single-language or suffer from a huge, but extremely sparse vocabulary. Both issues can be addressed by transfer learning. In this paper, we introduce a zero-shot cross-lingual topic model. Our model learns topics on one language (here, English), and predicts them for unseen documents in different languages (here, Italian, French, German, and Portuguese). We evaluate the quality of the topic predictions for the same document in different languages. Our results show that the transferred topics are coherent and stable across languages, which suggests exciting future research directions.}
}

@article{blei_prob_topic_models,
  title = {Probabilistic topic models},
  author = {Blei, David M.},
  journal = {Communications of the ACM},
  volume = {55},
  number = {4},
  month = apr,
  year = {2012},
  pages = {77--84},
  url = {https://doi.org/10.1145/2133806.2133826},
  doi = {10.1145/2133806.2133826}
}

@misc{top2vec,
  title = {Top2Vec: Distributed Representations of Topics},
  author = {Dimo Angelov},
  year = {2020},
  eprint = {2008.09470},
  archivePrefix = {arXiv},
  primaryClass = {cs.CL},
  url = {https://arxiv.org/abs/2008.09470}
}

@inproceedings{prodlda,
  title = {Autoencoding Variational Inference For Topic Models},
  author = {Akash Srivastava and Charles Sutton},
  booktitle = {International Conference on Learning Representations},
  year = {2017},
  url = {https://api.semanticscholar.org/CorpusID:29842525}
}

@article{scikit-learn,
  title = {Scikit-learn: Machine Learning in {P}ython},
  author = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal = {Journal of Machine Learning Research},
  volume = {12},
  pages = {2825--2830},
  year = {2011}
}

@inproceedings{blei_dynamic,
  title = {Dynamic Topic Models},
  author = {Blei, David M. and Lafferty, John D.},
  booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
  series = {ICML '06},
  year = {2006},
  isbn = {1595933832},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  location = {Pittsburgh, Pennsylvania, USA},
  pages = {113--120},
  numpages = {8},
  url = {https://doi.org/10.1145/1143844.1143859},
  doi = {10.1145/1143844.1143859},
  abstract = {A family of probabilistic time series models is developed to analyze the time evolution of topics in large document collections. The approach is to use state space models on the natural parameters of the multinomial distributions that represent the topics. Variational approximations based on Kalman filters and nonparametric wavelet regression are developed to carry out approximate posterior inference over the latent topics. In addition to giving quantitative, predictive models of a sequential corpus, dynamic topic models provide a qualitative window into the contents of a large document collection. The models are demonstrated by analyzing the OCR'ed archives of the journal Science from 1880 through 2000.}
}

@inproceedings{blei_hierarchical,
  title = {Hierarchical Topic Models and the Nested Chinese Restaurant Process},
  author = {Blei, David M. and Jordan, Michael I. and Griffiths, Thomas L. and Tenenbaum, Joshua B.},
  booktitle = {Proceedings of the 16th International Conference on Neural Information Processing Systems},
  series = {NIPS'03},
  year = {2003},
  publisher = {MIT Press},
  address = {Cambridge, MA, USA},
  location = {Whistler, British Columbia, Canada},
  pages = {17--24},
  numpages = {8},
  abstract = {We address the problem of learning topic hierarchies from data. The model selection problem in this domain is daunting—which of the large collection of possible trees to use? We take a Bayesian approach, generating an appropriate prior via a distribution on partitions that we refer to as the nested Chinese restaurant process. This nonparametric prior allows arbitrarily large branching factors and readily accommodates growing data collections. We build a hierarchical topic model by combining this prior with a likelihood that is based on a hierarchical variant of latent Dirichlet allocation. We illustrate our approach on simulated data and with an application to the modeling of NIPS abstracts.}
}

@misc{ctm_docs,
  title = {Contextualized Topic Models 2.5.0 documentation},
  author = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk},
  year = {2020},
  url = {https://contextualized-topic-models.readthedocs.io/en/latest/introduction.html}
}

@misc{fastopic,
  title = {FASTopic: A Fast, Adaptive, Stable, and Transferable Topic Modeling Paradigm},
  author = {Xiaobao Wu and Thong Nguyen and Delvin Ce Zhang and William Yang Wang and Anh Tuan Luu},
  year = {2024},
  eprint = {2405.17978},
  archivePrefix = {arXiv},
  primaryClass = {cs.CL},
  url = {https://arxiv.org/abs/2405.17978}
}

@article{sentence_transformers,
  title = {Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
  author = {Nils Reimers and Iryna Gurevych},
  journal = {CoRR},
  volume = {abs/1908.10084},
  year = {2019},
  url = {http://arxiv.org/abs/1908.10084},
  eprinttype = {arXiv},
  eprint = {1908.10084},
  timestamp = {Thu, 26 Nov 2020 12:13:54 +0100},
  biburl = {https://dblp.org/rec/journals/corr/abs-1908-10084.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@software{topicwizard,
  title = {{topicwizard: Pretty and opinionated topic model visualization in Python}},
  author = {Kardos, Márton},
  month = nov,
  year = {2023},
  url = {https://github.com/x-tabdeveloping/topic-wizard},
  version = {0.5.0}
}