% references.bib -- BibTeX bibliography database.
% (GitHub file-view page chrome and line-number gutter removed; entries follow.)
@article{abburuOntologyStorageModels2016,
  title = {Ontology {{Storage Models}} and {{Tools}}: {{An Authentic Survey}}},
  shorttitle = {Ontology {{Storage Models}} and {{Tools}}},
  author = {Abburu, Sunitha and Golla, Suresh Babu},
  year = {2016},
  month = oct,
  journal = {Journal of Intelligent Systems},
  volume = {25},
  number = {4},
  pages = {539--553},
  issn = {2191-026X, 0334-1860},
  doi = {10.1515/jisys-2014-0167},
  urldate = {2024-08-17},
  abstract = {Ontology is a formal, explicit specification of a shared conceptualization. Ontology provides domain vocabulary, domain knowledge, common understanding, shareability, information interoperability, reusability, concept hierarchy, and relationships that support semantic information retrieval. Ontology improves performance of the system by addressing interoperability issues due to semantic and syntactic heterogeneity. Vast numbers of application domain experts are using ontologies in diverse applications. Use of effective and efficient ontology storage system results improved performance in applications and enables semantic information retrieval. Many prominent researchers and software agencies have proposed and developed several ontology storage methods and tools with various features. The choice of a specific storage model/tool always depend on the specific purpose of the application and the nature of features that are available in the storage model/tool to be utilized in the specific applications. The familiarity of various ontology storage models and tools with the respective features helps user to choose an appropriate storage structure aiming at high-performance applications. The current research work is a comprehensively authentic study carryout out on various ontology storage models and tools with their respective features, which are very essential for optimum performance.},
  copyright = {http://creativecommons.org/licenses/by-nc-nd/3.0/},
  langid = {english},
}
@article{abdel-nabiDeepLearningbasedQuestion2023,
  title = {Deep Learning-Based Question Answering: A Survey},
  shorttitle = {Deep Learning-Based Question Answering},
  author = {{Abdel-Nabi}, Heba and Awajan, Arafat and Ali, Mostafa Z.},
  year = {2023},
  month = apr,
  journal = {Knowledge and Information Systems},
  volume = {65},
  number = {4},
  pages = {1399--1485},
  issn = {0219-1377, 0219-3116},
  doi = {10.1007/s10115-022-01783-5},
  urldate = {2024-03-04},
  abstract = {Question Answering is a crucial natural language processing task. This field of research has attracted a sudden amount of interest lately due mainly to the integration of the deep learning models in the Question Answering Systems which consequently power up many advancements and improvements. This survey aims to explore and shed light upon the recent and most powerful deep learning-based Question Answering Systems and classify them based on the deep learning model used, stating the details of the used word representation, datasets, and evaluation metrics. It aims to highlight and discuss the currently used models and give insights that direct future research to enhance this increasingly growing field.},
  langid = {english},
}
@inproceedings{abeduLLMBasedChatbotsMining2024,
  title = {{{LLM-Based Chatbots}} for {{Mining Software Repositories}}: {{Challenges}} and {{Opportunities}}},
  shorttitle = {{{LLM-Based Chatbots}} for {{Mining Software Repositories}}},
  booktitle = {Proceedings of the 28th {{International Conference}} on {{Evaluation}} and {{Assessment}} in {{Software Engineering}}},
  author = {Abedu, Samuel and Abdellatif, Ahmad and Shihab, Emad},
  year = {2024},
  month = jun,
  pages = {201--210},
  publisher = {ACM},
  address = {Salerno Italy},
  doi = {10.1145/3661167.3661218},
  urldate = {2024-08-17},
  abstract = {Software repositories have a plethora of information about software development, encompassing details such as code contributions, bug reports, code reviews, and project documentation. This rich source of data can be harnessed to enhance not only software quality and development velocity but also to gain insights into team collaboration, identify potential bottlenecks, and inform strategic decision-making throughout the software development lifecycle. Previous studies show that many stakeholders cannot benefit from the project information due to the technical knowledge and expertise required to extract the project data. To lower the barrier to entry by automating the process of extracting and analyzing repository data, we explored the potential of using a large language model (LLM) to develop a chatbot for answering questions related to software repositories. We evaluated the chatbot on a set of 150 software repository-related questions. We found that the chatbot correctly answered one question about the repository. This result prompted us to shift our focus to investigate the challenges in adopting LLMs for the out-of-the-box development of software repository chatbots. We identified five main challenges related to retrieving data, structuring the data, and generating the answer to the user's query. Among these challenges, the most frequent (83.3\%) is the inaccurate retrieval of data to answer questions. In this paper, we share our experience and challenges in developing an LLM-based chatbot to answer software repository-related questions within the SE community. We also provide recommendations on mitigating these challenges. 
Our findings will serve as a foundation to drive future research aimed at enhancing LLMs for adoption in extracting useful information from software repositories, fostering advancements in natural language understanding, data retrieval, and response generation within the context of software repository-related questions and analytics.},
  isbn = {9798400717017},
  langid = {english},
}
@misc{AbstractSemanticGraph2024,
title = {Abstract Semantic Graph},
year = {2024},
month = jul,
howpublished = {Wikipedia},
url = {https://en.wikipedia.org/w/index.php?title=Abstract_semantic_graph&oldid=1233821054},
urldate = {2024-11-22},
abstract = {In computer science, an abstract semantic graph (ASG) or term graph is a form of abstract syntax in which an expression of a formal or programming language is represented by a graph whose vertices are the expression's subterms. An ASG is at a higher level of abstraction than an abstract syntax tree (or AST), which is used to express the syntactic structure of an expression or program. ASGs are more complex and concise than ASTs because they may contain shared subterms (also known as "common subexpressions"). Abstract semantic graphs are often used as an intermediate representation by compilers to store the results of performing common subexpression elimination upon abstract syntax trees. ASTs are trees and are thus incapable of representing shared terms. ASGs are usually directed acyclic graphs (DAG), although in some applications graphs containing cycles may be permitted. For example, a graph containing a cycle might be used to represent the recursive expressions that are commonly used in functional programming languages as non-looping iteration constructs. The mutability of these types of graphs, is studied in the field of graph rewriting. The nomenclature term graph is associated with the field of term graph rewriting, which involves the transformation and processing of expressions by the specification of rewriting rules, whereas abstract semantic graph is used when discussing linguistics, programming languages, type systems and compilation. Abstract syntax trees are not capable of sharing subexpression nodes because it is not possible for a node in a proper tree to have more than one parent. Although this conceptual simplicity is appealing, it may come at the cost of redundant representation and, in turn, possibly inefficiently duplicating the computation of identical terms. For this reason ASGs are often used as an intermediate language at a subsequent compilation stage to abstract syntax tree construction via parsing. 
An abstract semantic graph is typically constructed from an abstract syntax tree by a process of enrichment and abstraction. The enrichment can for example be the addition of back-pointers, edges from an identifier node (where a variable is being used) to a node representing the declaration of that variable. The abstraction can entail the removal of details which are relevant only in parsing, not for semantics.},
copyright = {Creative Commons Attribution-ShareAlike License},
langid = {english},
annotation = {Page Version ID: 1233821054},
}
@inproceedings{abu-aishehExactGraphEdit2015,
  title = {An {{Exact Graph Edit Distance Algorithm}} for {{Solving Pattern Recognition Problems}}},
  booktitle = {4th {{International Conference}} on {{Pattern Recognition Applications}} and {{Methods}} 2015},
  author = {{Abu-Aisheh}, Zeina and Raveaux, Romain and Ramel, Jean-Yves and Martineau, Patrick},
  year = {2015},
  month = jan,
  address = {Lisbon, Portugal},
  doi = {10.5220/0005209202710278},
  urldate = {2024-08-02},
  abstract = {Graph edit distance is an error tolerant matching technique emerged as a powerful and flexible graph matching paradigm that can be used to address different tasks in pattern recognition, machine learning and data mining; it represents the minimum-cost sequence of basic edit operations to transform one graph into another by means of insertion, deletion and substitution of vertices and/or edges. A widely used method for exact graph edit distance computation is based on the A* algorithm. To overcome its high memory load while traversing the search tree for storing pending solutions to be explored, we propose a depth-first graph edit distance algorithm which requires less memory and searching time. An evaluation of all possible solutions is performed without explicitly enumerating them all. Candidates are discarded using an upper and lower bounds strategy. A solid experimental study is proposed; experiments on a publicly available database empirically demonstrated that our approach is better than the A* graph edit distance computation in terms of speed, accuracy and classification rate.},
  keywords = {Classification,Graph Edit Distance,Graph Matching,Pattern Recognition},
}
@inproceedings{agarwalNeurIPS2020NLC2CMD2021,
title = {{{NeurIPS}} 2020 {{NLC2CMD Competition}}: {{Translating Natural Language}} to {{Bash Commands}}},
shorttitle = {{{NeurIPS}} 2020 {{NLC2CMD Competition}}},
booktitle = {Proceedings of the {{NeurIPS}} 2020 {{Competition}} and {{Demonstration Track}}},
author = {Agarwal, Mayank and Chakraborti, Tathagata and Fu, Quchen and Gros, David and Lin, Xi Victoria and Maene, Jaron and Talamadupula, Kartik and Teng, Zhongwei and White, Jules},
year = {2021},
month = aug,
pages = {302--324},
publisher = {PMLR},
issn = {2640-3498},
url = {https://proceedings.mlr.press/v133/agarwal21b.html},
urldate = {2024-08-26},
abstract = {The NLC2CMD Competition hosted at NeurIPS 2020 aimed to bring the power of natural language processing to the command line. Participants were tasked with building models that can transform descriptions of command line tasks in English to their Bash syntax. This is a report on the competition with details of the task, metrics, data, attempted solutions, and lessons learned.},
langid = {english},
}
@misc{agarwalProjectCLAIInstrumenting2020,
title = {Project {{CLAI}}: {{Instrumenting}} the {{Command Line}} as a {{New Environment}} for {{AI Agents}}},
shorttitle = {Project {{CLAI}}},
author = {Agarwal, Mayank and Barroso, Jorge J. and Chakraborti, Tathagata and Dow, Eli M. and Fadnis, Kshitij and Godoy, Borja and Pallan, Madhavan and Talamadupula, Kartik},
year = {2020},
month = jun,
number = {arXiv:2002.00762},
eprint = {2002.00762},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2002.00762},
url = {http://arxiv.org/abs/2002.00762},
urldate = {2023-12-08},
abstract = {This paper reports on Project CLAI (Command Line AI) which aims to bring the power of AI to the command line interface (CLI). The CLAI platform sets up the CLI as a new environment for AI researchers to conquer by surfacing the command line as a generic environment that researchers can interface to using a simple sense-act API, much like the traditional AI agent architecture. In this paper, we discuss the design and implementation of the platform in detail, through illustrative use cases of new end user interaction patterns enabled by this design, and through quantitative evaluation of the system footprint of a CLAI-enabled terminal. We also report on some early user feedback on CLAI's features from an internal survey.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Human-Computer Interaction},
}
@misc{ahmadanisFineTuningLLAMAv22023,
title = {Fine {{Tuning LLAMAv2}} with {{QLora}} on {{Google Colab}} for {{Free}}},
author = {Anis, Ahmad},
year = {2023},
month = sep,
howpublished = {KDnuggets},
url = {https://www.kdnuggets.com/fine-tuning-llamav2-with-qlora-on-google-colab-for-free},
urldate = {2024-08-14},
abstract = {Learn how to fine-tune one of the most influential open-source models for free on Google Colab.},
chapter = {Originals},
langid = {american},
}
@article{albersModelbasedSystemsEngineering2019,
  title = {Model-Based Systems Engineering in Modular Design},
  author = {Albers, Albert and Bursac, Nikola and Scherer, Helmut and Birk, Clemens and Powelske, Jonas and Muschik, Sabine},
  year = {2019},
  month = jan,
  journal = {Design Science},
  volume = {5},
  pages = {e17},
  issn = {2053-4701},
  doi = {10.1017/dsj.2019.15},
  urldate = {2024-09-11},
  abstract = {Modular design allows to reduce costs based on scaling effects. However, due to strong alternating effects between the resulting modules and products, methods and tools are required that enable engineers to use specific views in which the respective information can be linked and retrieved according to the situation. Within the scope of this paper, the model-based systems engineering (MBSE) approach is used to model the complex real-world problem of vehicle modular kits. The aim is to investigate the potentials in this context, how modular kits and products can be efficiently modeled and finally how MBSE can support modular design. In order to investigate this in detail, two extensive studies are carried out in a company over a period of three years. The studies show that modular kits lead to an increased complexity of development. Across industries and companies, the demand for reference product models is shown, which facilitate the unification of inhomogeneous partial models and serve as a knowledge repository for the development of future product generations. On this basis, a framework is derived which enables the reuse of large proportions of the product models of previous product generations. This framework is evaluated on the basis of five case studies.},
  langid = {english},
  keywords = {case studies,mbse,modular design,PGE - product generation engineering,reference product models},
}
@misc{anglesFoundationsModernQuery2017,
title = {Foundations of {{Modern Query Languages}} for {{Graph Databases}}},
author = {Angles, Renzo and Arenas, Marcelo and Barcelo, Pablo and Hogan, Aidan and Reutter, Juan and Vrgoc, Domagoj},
year = {2017},
month = jun,
number = {arXiv:1610.06264},
eprint = {1610.06264},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1610.06264},
urldate = {2024-08-05},
abstract = {We survey foundational features underlying modern graph query languages. We first discuss two popular graph data models: edge-labelled graphs, where nodes are connected by directed, labelled edges; and property graphs, where nodes and edges can further have attributes. Next we discuss the two most fundamental graph querying functionalities: graph patterns and navigational expressions. We start with graph patterns, in which a graph-structured query is matched against the data. Thereafter we discuss navigational expressions, in which patterns can be matched recursively against the graph to navigate paths of arbitrary length; we give an overview of what kinds of expressions have been proposed, and how they can be combined with graph patterns. We also discuss several semantics under which queries using the previous features can be evaluated, what effects the selection of features and semantics has on complexity, and offer examples of such features in three modern languages that are used to query graphs: SPARQL, Cypher and Gremlin. We conclude by discussing the importance of formalisation for graph query languages; a summary of what is known about SPARQL, Cypher and Gremlin in terms of expressivity and complexity; and an outline of possible future directions for the area.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Databases},
}
@misc{anKnowledgeGraphQuestion2024,
  title = {Knowledge {{Graph Question Answering}} for {{Materials Science}} ({{KGQA4MAT}}): {{Developing Natural Language Interface}} for {{Metal-Organic Frameworks Knowledge Graph}} ({{MOF-KG}}) {{Using LLM}}},
  shorttitle = {Knowledge {{Graph Question Answering}} for {{Materials Science}} ({{KGQA4MAT}})},
  author = {An, Yuan and Greenberg, Jane and Kalinowski, Alex and Zhao, Xintong and Hu, Xiaohua and {Uribe-Romo}, Fernando J. and Langlois, Kyle and Furst, Jacob and {G{\'o}mez-Gualdr{\'o}n}, Diego A.},
  year = {2024},
  month = jun,
  number = {arXiv:2309.11361},
  eprint = {2309.11361},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2309.11361},
  urldate = {2024-12-17},
  abstract = {We present a comprehensive benchmark dataset for Knowledge Graph Question Answering in Materials Science (KGQA4MAT), with a focus on metal-organic frameworks (MOFs). A knowledge graph for metal-organic frameworks (MOF-KG) has been constructed by integrating structured databases and knowledge extracted from the literature. To enhance MOF-KG accessibility for domain experts, we aim to develop a natural language interface for querying the knowledge graph. We have developed a benchmark comprised of 161 complex questions involving comparison, aggregation, and complicated graph structures. Each question is rephrased in three additional variations, resulting in 644 questions and 161 KG queries. To evaluate the benchmark, we have developed a systematic approach for utilizing the LLM, ChatGPT, to translate natural language questions into formal KG queries. We also apply the approach to the well-known QALD-9 dataset, demonstrating ChatGPT's potential in addressing KGQA issues for different platforms and query languages. The benchmark and the proposed approach aim to stimulate further research and development of user-friendly and efficient interfaces for querying domain-specific materials science knowledge graphs, thereby accelerating the discovery of novel materials.},
  archiveprefix = {arXiv},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence},
}
@article{arenasSPARQLFormalization,
title = {{{SPARQL Formalization}}},
author = {Arenas, Marcelo and Gutierrez, Claudio and P{\'e}rez, Jorge},
langid = {english},
internal-note = {NOTE(review): entry is missing the required year and journal fields (and any DOI/URL) -- presumably a talk or working document; TODO locate the source and complete},
}
@article{aslaniParallelTBoxClassification,
title = {Parallel {{TBox Classification}} in {{Description Logics}} -- {{First Experimental Results}}},
author = {Aslani, Mina and Haarslev, Volker},
abstract = {One of the most frequently used inference services of description logic reasoners classifies all named classes of OWL ontologies into a subsumption hierarchy. Due to emerging OWL ontologies from the web community consisting of up to hundreds of thousand of named classes and the increasing availability of multi-processor and multi- or many-core computers, we extend our work on parallel TBox classification and propose a new algorithm that is sound and complete and demonstrates in a first experimental evaluation a low overhead w.r.t. subsumption tests (less than 3\%) if compared with sequential classification.},
langid = {english},
internal-note = {NOTE(review): entry is missing the required year and journal/venue fields -- TODO verify the publication venue and complete},
}
@misc{auerSciQABenchmarkDataset2023,
  title = {{{SciQA}} Benchmark: {{Dataset}} and {{RDF}} Dump},
  shorttitle = {{{SciQA}} Benchmark},
  author = {Auer, S{\"o}ren and Barone, Dante A. C. and Bartz, Cassiano and Cortes, Eduardo G. and Jaradeh, Mohamad Yaser and Karras, Oliver and Koubarakis, Manolis and Mouromtsev, Dmitry and Pliukhin, Dmitrii and Radyush, Daniil and Shilin, Ivan and Stocker, Markus and Tsalapati, Eleni},
  year = {2023},
  month = mar,
  publisher = {Zenodo},
  doi = {10.5281/ZENODO.7729047},
  urldate = {2024-07-10},
  abstract = {SciQA benchmark of questions and queries. The data dump is in NTriples format (RDF NT) taken from the ORKG system on 14.02.2023 at 02:04PM. The dump can be imported into a virtuoso endpoint or any RDF engine so it can be queried. The questions/queries are provided as spread sheets (Excel format \& CSV format), also train and test files are provided for each of the sets. Huggingface~datasets are also attached in the archive to make it easy to integrate with existing workflows and to enable the automated evaluation of SciQA within challenges. Types of questions and queries: Handcrafted set of 100 questions Auto-generated set of 2465 questions More details on certain columns: "Classification rationale" It may contain the following values: Nested facts in the question Sorting, sum, average, minimum, maximum or count calculation required Filter used Mappings of Asking Point in the question to the ORKG ontology Explanation of Rationale for Non-factoid: Nested facts in the question. An entity (e.g., a system or a paper) or predicate is requested that is not explicitly stated in the question text and must be inferred while searching for an answer.~ Sorting, sum, average, minimum, maximum or count calculation required. To get the answer to the question it is necessary to make an aggregation of the query results.~ Filter used. To get the answer to the question it is necessary to use filtering of the query results by some conditions.},
  copyright = {Creative Commons Attribution 4.0 International, Open Access},
  langid = {english},
  keywords = {ORKG,QA dataset,Question answering,rdf dump,Scholarly knowledge graph,SciQA benchmark},
}
@article{auerSciQAScientificQuestion2023,
title = {The {{SciQA Scientific Question Answering Benchmark}} for {{Scholarly Knowledge}}},
author = {Auer, S{\"o}ren and Barone, Dante A. C. and Bartz, Cassiano and Cortes, Eduardo G. and Jaradeh, Mohamad Yaser and Karras, Oliver and Koubarakis, Manolis and Mouromtsev, Dmitry and Pliukhin, Dmitrii and Radyush, Daniil and Shilin, Ivan and Stocker, Markus and Tsalapati, Eleni},
year = {2023},
month = may,
journal = {Scientific Reports},
volume = {13},
number = {1},
pages = {7240},
issn = {2045-2322},
doi = {10.1038/s41598-023-33607-z},
urldate = {2024-05-27},
abstract = {Knowledge graphs have gained increasing popularity in the last decade in science and technology. However, knowledge graphs are currently relatively simple to moderate semantic structures that are mainly a collection of factual statements. Question answering (QA) benchmarks and systems were so far mainly geared towards encyclopedic knowledge graphs such as DBpedia and Wikidata. We present SciQA a scientific QA benchmark for scholarly knowledge. The benchmark leverages the Open Research Knowledge Graph (ORKG) which includes almost 170,000 resources describing research contributions of almost 15,000 scholarly articles from 709 research fields. Following a bottom-up methodology, we first manually developed a set of 100 complex questions that can be answered using this knowledge graph. Furthermore, we devised eight question templates with which we automatically generated further 2465 questions, that can also be answered with the ORKG. The questions cover a range of research fields and question types and are translated into corresponding SPARQL queries over the ORKG. Based on two preliminary evaluations, we show that the resulting SciQA benchmark represents a challenging task for next-generation QA systems. This task is part of the open competitions at the 22nd International Semantic Web Conference 2023 as the Scholarly Question Answering over Linked Data (QALD) Challenge.},
langid = {english},
annotation = {Read\_Status: Read\\
Read\_Status\_Date: 2024-12-10T10:33:01.492Z},
}
@misc{ayoolaReFinEDEfficientZeroshotcapable2022,
title = {{{ReFinED}}: {{An Efficient Zero-shot-capable Approach}} to {{End-to-End Entity Linking}}},
shorttitle = {{{ReFinED}}},
author = {Ayoola, Tom and Tyagi, Shubhi and Fisher, Joseph and Christodoulopoulos, Christos and Pierleoni, Andrea},
year = {2022},
month = jul,
number = {arXiv:2207.04108},
eprint = {2207.04108},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2207.04108},
urldate = {2023-12-18},
abstract = {We introduce ReFinED, an efficient end-toend entity linking model which uses finegrained entity types and entity descriptions to perform linking. The model performs mention detection, fine-grained entity typing, and entity disambiguation for all mentions within a document in a single forward pass, making it more than 60 times faster than competitive existing approaches. ReFinED also surpasses state-of-the-art performance on standard entity linking datasets by an average of 3.7 F1. The model is capable of generalising to large-scale knowledge bases such as Wikidata (which has 15 times more entities than Wikipedia) and of zero-shot entity linking. The combination of speed, accuracy and scale makes ReFinED an effective and cost-efficient system for extracting entities from web-scale datasets, for which the model has been successfully deployed. Our code and pre-trained models are available at https://github.com/alexa/ReFinED.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computation and Language,KBQA},
}
@article{badenes-olmedoMuHeQAZeroshotQuestion2023,
title = {{{MuHeQA}}: {{Zero-shot}} Question Answering over Multiple and Heterogeneous Knowledge Bases},
shorttitle = {{{MuHeQA}}},
author = {{Badenes-Olmedo}, Carlos and Corcho, Oscar},
editor = {Fu, Bo and Lambrix, Patrick and Pesquita, Catia},
year = {2023},
month = jun,
journal = {Semantic Web},
pages = {1--15},
issn = {22104968, 15700844},
doi = {10.3233/SW-233379},
urldate = {2023-12-19},
abstract = {There are two main limitations in most of the existing Knowledge Graph Question Answering (KGQA) algorithms. First, the approaches depend heavily on the structure and cannot be easily adapted to other KGs. Second, the availability and amount of additional domain-specific data in structured or unstructured formats has also proven to be critical in many of these systems. Such dependencies limit the applicability of KGQA systems and make their adoption difficult. A novel algorithm is proposed, MuHeQA, that alleviates both limitations by retrieving the answer from textual content automatically generated from KGs instead of queries over them. This new approach (1) works on one or several KGs simultaneously, (2) does not require training data what makes it is domain-independent, (3) enables the combination of knowledge graphs with unstructured information sources to build the answer, and (4) reduces the dependency on the underlying schema since it does not navigate through structured content but only reads property values. MuHeQA extracts answers from textual summaries created by combining information related to the question from multiple knowledge bases, be them structured or not. Experiments over Wikidata and DBpedia show that our approach achieves comparable performance to other approaches in single-fact questions while being domain and KG independent. Results raise important questions for future work about how the textual content that can be created from knowledge graphs enables answer extraction.},
langid = {english},
keywords = {KBQA},
}
% Conference paper (ACM SIGIR 2022, Madrid): PLM baselines (BART, T5, PGN) for
% NL-to-SPARQL semantic parsing over DBpedia/Wikidata. Fix: added the missing
% publisher and conference location, matching the style of the other ACM
% @inproceedings entries in this file.
@inproceedings{banerjeeModernBaselinesSPARQL2022,
  title = {Modern {{Baselines}} for {{SPARQL Semantic Parsing}}},
  booktitle = {Proceedings of the 45th {{International ACM SIGIR Conference}} on {{Research}} and {{Development}} in {{Information Retrieval}}},
  author = {Banerjee, Debayan and Nair, Pranav Ajit and Kaur, Jivat Neet and Usbeck, Ricardo and Biemann, Chris},
  year = {2022},
  month = jul,
  eprint = {2204.12793},
  primaryclass = {cs},
  pages = {2260--2265},
  publisher = {ACM},
  address = {Madrid, Spain},
  doi = {10.1145/3477495.3531841},
  urldate = {2024-05-28},
  abstract = {In this work, we focus on the task of generating SPARQL queries from natural language questions, which can then be executed on Knowledge Graphs (KGs). We assume that gold entity and relations have been provided, and the remaining task is to arrange them in the right order along with SPARQL vocabulary, and input tokens to produce the correct SPARQL query. Pre-trained Language Models (PLMs) have not been explored in depth on this task so far, so we experiment with BART, T5 and PGNs (Pointer Generator Networks) with BERT embeddings, looking for new baselines in the PLM era for this task, on DBpedia and Wikidata KGs. We show that T5 requires special input tokenisation, but produces state of the art performance on LC-QuAD 1.0 and LC-QuAD 2.0 datasets, and outperforms task-specific models from previous works. Moreover, the methods enable semantic parsing for questions where a part of the input needs to be copied to the output query, thus enabling a new paradigm in KG semantic parsing. Code and data used for this work can be found at https://github.com/debayan/sigir2022sparqlbaselines.},
  archiveprefix = {arXiv},
  langid = {english},
  keywords = {Computer Science - Computation and Language,Computer Science - Information Retrieval},
}
% Conference paper (IEEE Aerospace 2012): MBSE on the Europa mission concept
% study at JPL. Fix: the author "{Seung Chung}" was double-braced as if it were
% a corporate name, which makes BibTeX treat the whole string as a surname;
% Seung Chung is a person, so the name now uses the unambiguous "Last, First"
% form like the other authors in the list.
@inproceedings{bayerModelBasedSystems2012,
  title = {Model {{Based Systems Engineering}} on the {{Europa}} Mission Concept Study},
  booktitle = {2012 {{IEEE Aerospace Conference}}},
  author = {Bayer, T. J. and Chung, Seung and Cole, B. and Cooke, B. and Dekens, F. and Delp, C. and Gontijo, I. and Lewis, K. and Moshir, M. and Rasmussen, R. and Wagner, D.},
  year = {2012},
  month = mar,
  pages = {1--18},
  publisher = {IEEE},
  address = {Big Sky, MT},
  doi = {10.1109/AERO.2012.6187337},
  urldate = {2024-03-05},
  abstract = {At the start of 2011, the proposed Jupiter Europa Orbiter (JEO) mission was staffing up in expectation of becoming an official project later in the year for a launch in 2020. A unique aspect of the pre-project work was a strong emphasis and investment on the foundations of Model-Based Systems Engineering (MBSE). As so often happens in this business, plans changed: NASA's budget and science priorities were released and together fundamentally changed the course of JEO. As a result, it returned to being a study task whose objective is to propose more affordable ways to accomplish the science. As part of this transition, the question arose as to whether it could continue to afford the investment in MBSE. In short, the MBSE infusion has survived and is providing clear value to the study effort. In the process, the need to remain relevant in the new environment has brought about a wave of innovation and progress. By leveraging the existing infrastructure and a modest additional investment, striking advances in the capture and analysis of designs using MBSE were achieved. The effort has reaffirmed the importance of architecting. It has successfully harnessed the synergistic relationship of architecting to system modeling. We have found that MBSE can provide greater agility than traditional methods. We have also found that a diverse `ecosystem' of modeling tools and languages (SysML, Mathematica, even Excel) is not only viable, but an important enabler of agility and adaptability. This paper will describe the successful application of MBSE in the dynamic environment of early mission formulation, the significant results produced and lessons learned in the process.},
  isbn = {978-1-4577-0557-1 978-1-4577-0556-4 978-1-4577-0555-7},
  langid = {english},
}
% Conference paper (IEEE SysCon 2024): a SysML profile (DSML) integrating the
% VDI/VDE 3682 Formalised Process Description into MBSE, with OCL verification
% and XML serialization. The annotation field carries reference-manager
% read-status metadata; BibTeX ignores unknown fields, so it is harmless.
@inproceedings{beersSysMLProfileStandardized2024,
  title = {A {{SysML Profile}} for the {{Standardized Description}} of {{Processes}} during {{System Development}}},
  booktitle = {2024 {{IEEE International Systems Conference}} ({{SysCon}})},
  author = {Beers, Lasse and Nabizada, Hamied and Weigand, Maximilian and Gehlhoff, Felix and Fay, Alexander},
  year = {2024},
  month = apr,
  pages = {1--8},
  issn = {2472-9647},
  doi = {10.1109/SysCon61195.2024.10553402},
  urldate = {2024-12-09},
  abstract = {A key aspect in creating models of production systems with the use of model-based systems engineering (MBSE) lies in the description of system functions. These functions should be described in a clear and standardized manner.The VDI/VDE 3682 standard for Formalised Process Description (FPD) provides a simple and easily understandable representation of processes. These processes can be conceptualized as functions within the system model, making the FPD particularly well-suited for the standardized representation of the required functions. Hence, this contribution focuses on the development of a Domain-Specific Modeling Language (DSML) that facilitates the integration of VDI/VDE 3682 into the Systems Modeling Language (SysML). The presented approach not only extends classical SysML with domain-specific requirements but also facilitates model verification through constraints modeled in Object Constraint Language (OCL). Additionally, it enables automatic serialization of process descriptions into the Extensible Markup Language (XML) using the Velocity Template Language (VTL). This serialization enables the use of process modeling in applications outside of MBSE. The approach was validated using an collar screwing use case in the major component assembly in aircraft production.},
  keywords = {Aircraft propulsion,Atmospheric modeling,Domain-Specific Modeling Language,Formalised Process Description,Model-Based Systems Engineering,Modeling,Process modeling,Production systems,SysML Profile,Systems Modeling Language,VDI/VDE 3682,XML},
  annotation = {Read\_Status: Read\\
  Read\_Status\_Date: 2024-12-09T17:44:27.820Z},
}
% Conference paper (ACM FAccT 2021), the "Stochastic Parrots" paper. Fixes:
% removed the parrot emoji from the title -- the official title does contain
% it, but a raw emoji is non-ASCII and breaks classic BibTeX/LaTeX toolchains
% (this file otherwise uses LaTeX escapes such as {\'e}); added the missing
% comma in the address field.
@inproceedings{benderDangersStochasticParrots2021,
  title = {On the {{Dangers}} of {{Stochastic Parrots}}: {{Can Language Models Be Too Big}}?},
  shorttitle = {On the {{Dangers}} of {{Stochastic Parrots}}},
  booktitle = {Proceedings of the 2021 {{ACM Conference}} on {{Fairness}}, {{Accountability}}, and {{Transparency}}},
  author = {Bender, Emily M. and Gebru, Timnit and {McMillan-Major}, Angelina and Shmitchell, Shmargaret},
  year = {2021},
  month = mar,
  pages = {610--623},
  publisher = {ACM},
  address = {Virtual Event, Canada},
  doi = {10.1145/3442188.3445922},
  urldate = {2023-12-05},
  abstract = {The past 3 years of work in NLP have been characterized by the development and deployment of ever larger language models, especially for English. BERT, its variants, GPT-2/3, and others, most recently Switch-C, have pushed the boundaries of the possible both through architectural innovations and through sheer size. Using these pretrained models and the methodology of fine-tuning them for specific tasks, researchers have extended the state of the art on a wide array of tasks as measured by leaderboards on specific benchmarks for English. In this paper, we take a step back and ask: How big is too big? What are the possible risks associated with this technology and what paths are available for mitigating those risks? We provide recommendations including weighing the environmental and financial costs first, investing resources into curating and carefully documenting datasets rather than ingesting everything on the web, carrying out pre-development exercises evaluating how the planned approach fits into research and development goals and supports stakeholder values, and encouraging research directions beyond ever larger language models.},
  isbn = {978-1-4503-8309-7},
  langid = {english},
  keywords = {Foundational},
}
% Journal article (TACL vol. 6, 2018): proposes "data statements" as a
% professional practice for documenting NLP datasets. Clean entry; editor
% field lists the TACL action editors.
@article{benderDataStatementsNatural2018,
  title = {Data {{Statements}} for {{Natural Language Processing}}: {{Toward Mitigating System Bias}} and {{Enabling Better Science}}},
  shorttitle = {Data {{Statements}} for {{Natural Language Processing}}},
  author = {Bender, Emily M. and Friedman, Batya},
  editor = {Lee, Lillian and Johnson, Mark and Toutanova, Kristina and Roark, Brian},
  year = {2018},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {6},
  pages = {587--604},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  doi = {10.1162/tacl_a_00041},
  urldate = {2024-03-02},
  abstract = {In this paper, we propose data statements as a design solution and professional practice for natural language processing technologists, in both research and development. Through the adoption and widespread use of data statements, the field can begin to address critical scientific and ethical issues that result from the use of data from certain populations in the development of technology for other populations. We present a form that data statements can take and explore the implications of adopting them as part of regular practice. We argue that data statements will help alleviate issues related to exclusion and bias in language technology, lead to better precision in claims about how natural language processing research can generalize and thus better engineering results, protect companies from public embarrassment, and ultimately lead to language technology that meets its users in their own preferred linguistic style and furthermore does not misrepresent them to others.},
}
% Conference paper (EMNLP 2013): semantic parsing on Freebase from
% question-answer pairs (the WebQuestions paper). No DOI recorded; the ACL
% Anthology URL serves as the stable identifier.
@inproceedings{berantSemanticParsingFreebase2013,
  title = {Semantic {{Parsing}} on {{Freebase}} from {{Question-Answer Pairs}}},
  booktitle = {Proceedings of the 2013 {{Conference}} on {{Empirical Methods}} in {{Natural Language Processing}}},
  author = {Berant, Jonathan and Chou, Andrew and Frostig, Roy and Liang, Percy},
  editor = {Yarowsky, David and Baldwin, Timothy and Korhonen, Anna and Livescu, Karen and Bethard, Steven},
  year = {2013},
  month = oct,
  pages = {1533--1544},
  publisher = {Association for Computational Linguistics},
  address = {Seattle, Washington, USA},
  url = {https://aclanthology.org/D13-1160},
  urldate = {2024-08-04},
}
% Journal-style entry for an AAAI 2024 paper (AAAI proceedings are indexed as
% Proceedings of the AAAI Conference on Artificial Intelligence, hence
% @article): the Graph of Thoughts (GoT) prompting framework.
@article{bestaGraphThoughtsSolving2024,
  title = {Graph of {{Thoughts}}: {{Solving Elaborate Problems}} with {{Large Language Models}}},
  shorttitle = {Graph of {{Thoughts}}},
  author = {Besta, Maciej and Blach, Nils and Kubicek, Ales and Gerstenberger, Robert and Podstawski, Michal and Gianinazzi, Lukas and Gajda, Joanna and Lehmann, Tomasz and Niewiadomski, Hubert and Nyczyk, Piotr and Hoefler, Torsten},
  year = {2024},
  month = mar,
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume = {38},
  number = {16},
  pages = {17682--17690},
  issn = {2374-3468, 2159-5399},
  doi = {10.1609/aaai.v38i16.29720},
  urldate = {2024-08-17},
  abstract = {We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-ofThought or Tree of Thoughts (ToT). The key idea and primary advantage of GoT is the ability to model the information generated by an LLM as an arbitrary graph, where units of information (``LLM thoughts'') are vertices, and edges correspond to dependencies between these vertices. This approach enables combining arbitrary LLM thoughts into synergistic outcomes, distilling the essence of whole networks of thoughts, or enhancing thoughts using feedback loops. We illustrate that GoT offers advantages over state of the art on different tasks, for example increasing the quality of sorting by 62\% over ToT, while simultaneously reducing costs by {$>$}31\%. We ensure that GoT is extensible with new thought transformations and thus can be used to spearhead new prompting schemes. This work brings the LLM reasoning closer to human thinking or brain mechanisms such as recurrence, both of which form complex networks.},
  langid = {english},
}
% Book chapter (Handbook of System Safety and Security, Elsevier 2017).
% NOTE(review): author names are stored initials-only ("Bialy, M."); best
% practice is full given names -- expand if the full names can be confirmed.
% The annotation field is reference-manager metadata, ignored by BibTeX.
@incollection{bialySoftwareEngineeringModelBased2017,
  title = {Software {{Engineering}} for {{Model-Based Development}} by {{Domain Experts}}},
  booktitle = {Handbook of {{System Safety}} and {{Security}}},
  author = {Bialy, M. and Pantelic, V. and Jaskolka, J. and Schaap, A. and Patcas, L. and Lawford, M. and Wassyng, A.},
  year = {2017},
  pages = {39--64},
  publisher = {Elsevier},
  doi = {10.1016/B978-0-12-803773-7.00003-6},
  urldate = {2024-12-09},
  copyright = {https://www.elsevier.com/tdm/userlicense/1.0/},
  isbn = {978-0-12-803773-7},
  langid = {english},
  annotation = {Read\_Status: Read\\
  Read\_Status\_Date: 2024-12-10T10:14:38.964Z},
}
% Journal article (Mathematical Programming 59(1--3), 1993): constant-factor
% approximation for the prize-collecting TSP. Fix: the issue range in the
% number field used a single hyphen ("1-3"); ranges take a double hyphen.
@article{bienstockNotePrizeCollecting1993,
  title = {A Note on the Prize Collecting Traveling Salesman Problem},
  author = {Bienstock, Daniel and Goemans, Michel X. and {Simchi-Levi}, David and Williamson, David},
  year = {1993},
  month = mar,
  journal = {Mathematical Programming},
  volume = {59},
  number = {1--3},
  pages = {413--420},
  issn = {0025-5610, 1436-4646},
  doi = {10.1007/BF01581256},
  urldate = {2024-08-17},
  abstract = {We study the version of the prize collecting traveling salesman problem, where the objective is to find a tour that visits a subset of vertices such that the length of the tour plus the sum of penalties associated with vertices not in the tour is as small as possible. We present an approximation algorithm with constant bound. The algorithm is based on Christofides' algorithm for the traveling salesman problem as well as a method to round fractional solutions of a linear programming relaxation to integers, feasible for the original problem.},
  copyright = {http://www.springer.com/tdm},
  langid = {english},
}
% arXiv preprint (2002.11701): CLARA, interactive clinical-report
% auto-completion. Uses the arXiv-assigned DOI (10.48550/arXiv.*) rather than
% a URL -- the convention followed by the other arXiv entries in this file.
@misc{biswalCLARAClinicalReport2020,
  title = {{{CLARA}}: {{Clinical Report Auto-completion}}},
  shorttitle = {{{CLARA}}},
  author = {Biswal, Siddharth and Xiao, Cao and Glass, Lucas M. and Westover, M. Brandon and Sun, Jimeng},
  year = {2020},
  month = mar,
  number = {arXiv:2002.11701},
  eprint = {2002.11701},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2002.11701},
  urldate = {2024-01-27},
  abstract = {Generating clinical reports from raw recordings such as Xrays and electroencephalogram (EEG) is an essential and routine task for doctors. However, it is often time-consuming to write accurate and detailed reports. Most existing methods try to generate the whole reports from the raw input with limited success because 1) generated reports often contain errors that need manual review and correction, 2) it does not save time when doctors want to write additional information into the report, and 3) the generated reports are not customized based on individual doctors' preference. We propose CLinicAl Report Auto-completion (CLARA), an interactive method that generates reports in a sentence by sentence fashion based on doctors' anchor words and partially completed sentences. CLARA searches for most relevant sentences from existing reports as the template for the current report. The retrieved sentences are sequentially modified by combining with the input feature representations to create the final report. In our experimental evaluation CLARA achieved 0.393 CIDEr and 0.248 BLEU4 on X-ray reports and 0.482 CIDEr and 0.491 BLEU-4 for EEG reports for sentence-level generation, which is up to 35\% improvement over the best baseline. Also via our qualitative evaluation, CLARA is shown to produce reports which have a significantly higher level of approval by doctors in a user study (3.74 out of 5 for CLARA vs 2.52 out of 5 for the baseline).},
  archiveprefix = {arXiv},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Human-Computer Interaction,Computer Science - Machine Learning,Statistics - Machine Learning},
}
% Journal article (TACL vol. 5, 2017): subword (character n-gram) word
% embeddings -- the fastText paper. Clean entry.
@article{bojanowskiEnrichingWordVectors2017,
  title = {Enriching {{Word Vectors}} with {{Subword Information}}},
  author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  year = {2017},
  month = dec,
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {5},
  pages = {135--146},
  issn = {2307-387X},
  doi = {10.1162/tacl_a_00051},
  urldate = {2024-05-30},
  abstract = {Continuous word representations, trained on large unlabeled corpora are useful for many natural language processing tasks. Popular models that learn such representations ignore the morphology of words, by assigning a distinct vector to each word. This is a limitation, especially for languages with large vocabularies and many rare words. In this paper, we propose a new approach based on the skipgram model, where each word is represented as a bag of character n-grams. A vector representation is associated to each character n-gram; words being represented as the sum of these representations. Our method is fast, allowing to train models on large corpora quickly and allows us to compute word representations for words that did not appear in the training data. We evaluate our word representations on nine different languages, both on word similarity and analogy tasks. By comparing to recently proposed morphological word representations, we show that our vectors achieve state-of-the-art performance on these tasks.},
  langid = {english},
}
% Conference paper (NLDB 2004, Springer LNCS): NLG of reports from OWL/RDF
% ontologies (the MIAKT system). The abstract is quoted verbatim from the
% source, including its original spelling ("minimises").
@inproceedings{bontchevaAutomaticReportGeneration2004,
  title = {Automatic {{Report Generation}} from {{Ontologies}}: {{The MIAKT Approach}}},
  shorttitle = {Automatic {{Report Generation}} from {{Ontologies}}},
  booktitle = {Natural {{Language Processing}} and {{Information Systems}}},
  author = {Bontcheva, Kalina and Wilks, Yorick},
  editor = {Meziane, Farid and M{\'e}tais, Elisabeth},
  year = {2004},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  pages = {324--335},
  publisher = {Springer},
  address = {Berlin, Heidelberg},
  doi = {10.1007/978-3-540-27779-8_28},
  abstract = {This paper presented an approach for automatic generation of reports from domain ontologies encoded in Semantic Web standards like OWL. The paper identifies the challenges that need to be addressed when generating text from RDF and OWL and demonstrates how the ontology is used during the different stages of the generation process. The main contribution is in showing how NLG tools that take Semantic Web ontologies as their input can be designed to minimises the portability effort, while offering better output than template-based ontology verbalisers.},
  isbn = {978-3-540-27779-8},
  langid = {english},
  keywords = {Domain Ontology,Natural Language Generation,Property Hierarchy,Resource Description Framework,Resource Description Framework Triple},
}
% Journal article (Expert Systems with Applications 229, 2023): SPARQL-QA-v2,
% combining NEL/NER with neural machine translation for KBQA. Fix: hyphenated
% the ISSN (0957-4174), which was stored as a bare digit string.
@article{borrotoSPARQLQAv2SystemKnowledge2023,
  title = {{{SPARQL-QA-v2}} System for {{Knowledge Base Question Answering}}},
  author = {Borroto, Manuel A. and Ricca, Francesco},
  year = {2023},
  month = nov,
  journal = {Expert Systems with Applications},
  volume = {229},
  pages = {120383},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2023.120383},
  urldate = {2023-12-01},
  abstract = {Accessing the large volumes of information available in public knowledge bases might be complicated for those users unfamiliar with formal languages, such as the SPARQL query language and the ontology definition languages. This issue can be overcome by providing systems able to answer questions posed in natural language on a knowledge base, a task that is called Knowledge Base Question Answering (KBQA) in the literature. More in detail, many KBQA systems aim at translating automatically questions into the corresponding SPARQL queries to be executed over the knowledge base to get the answers. Effective state-of-the-art KBQA systems are based on neural-machine translation but easily fail to recognize words that are Out Of the Vocabulary (OOV) of the training set. This is a serious issue while querying large ontologies where the list of entities is huge and easily evolves over time. In this paper, we present the SPARQL-QA-v2 system that combines in an innovative way Named Entity Linking, Named Entity Recognition, and Neural Machine Translation for addressing the problem of generating SPARQL queries from questions posed in natural language. We demonstrate empirically that SPARQL-QA-v2 is effective and resilient to OOV words and delivers state-of-the-art performance in well-known datasets for question answering over DBpedia and Wikidata knowledge bases.},
  langid = {english},
  keywords = {KBQA},
}
% Journal article (Software & Systems Modeling 15(2), 2016): ModelJoin, a
% textual DSL for declaratively defining custom views over multiple
% metamodels. Clean entry; umlauts use classic-BibTeX {\"o}/{\"u} escapes.
@article{burgerViewbasedModeldrivenSoftware2016,
  title = {View-Based Model-Driven Software Development with {{ModelJoin}}},
  author = {Burger, Erik and Henss, J{\"o}rg and K{\"u}ster, Martin and Kruse, Steffen and Happe, Lucia},
  year = {2016},
  month = may,
  journal = {Software \& Systems Modeling},
  volume = {15},
  number = {2},
  pages = {473--496},
  issn = {1619-1366, 1619-1374},
  doi = {10.1007/s10270-014-0413-5},
  urldate = {2024-04-15},
  abstract = {Fragmentation of information across instances of different metamodels poses a significant problem for software developers and leads to a major increase in effort of transformation development. Moreover, compositions of metamodels tend to be incomplete, imprecise, and erroneous, making it impossible to present it to users or use it directly as input for applications. Customized views satisfy information needs by focusing on a particular concern, and filtering out information that is not relevant to this concern. For a broad establishment of view-based approaches, an automated solution to deal with separate metamodels and the high complexity of model transformations is necessary. In this paper, we present the ModelJoin approach for the rapid creation of views. Using a human-readable textual DSL, developers can define custom views declaratively without having to write model transformations or define a bridging metamodel.},
  langid = {english},
}
% Conference paper (IEEE ICDE 2018): SEMPROP -- linking datasets with word
% embeddings for data discovery, part of the Aurum system. Clean entry.
@inproceedings{castrofernandezSeepingSemanticsLinking2018,
  title = {Seeping {{Semantics}}: {{Linking Datasets Using Word Embeddings}} for {{Data Discovery}}},
  shorttitle = {Seeping {{Semantics}}},
  booktitle = {2018 {{IEEE}} 34th {{International Conference}} on {{Data Engineering}} ({{ICDE}})},
  author = {Castro Fernandez, Raul and Mansour, Essam and Qahtan, Abdulhakim A. and Elmagarmid, Ahmed and Ilyas, Ihab and Madden, Samuel and Ouzzani, Mourad and Stonebraker, Michael and Tang, Nan},
  year = {2018},
  month = apr,
  pages = {989--1000},
  publisher = {IEEE},
  address = {Paris},
  doi = {10.1109/ICDE.2018.00093},
  urldate = {2024-08-09},
  abstract = {Employees that spend more time finding relevant data than analyzing it suffer from a data discovery problem. The large volume of data in enterprises, and sometimes the lack of knowledge of the schemas aggravates this problem. Similar to how we navigate the Web, we propose to identify semantic links that assist analysts in their discovery tasks. These links relate tables to each other, to facilitate navigating the schemas. They also relate data to external data sources, such as ontologies and dictionaries, to help explain the schema meaning. We materialize the links in an enterprise knowledge graph, where they become available to analysts. The main challenge is how to find pairs of objects that are semantically related. We propose SEMPROP, a DAG of different components that find links based on syntactic and semantic similarities. SEMPROP is commanded by a semantic matcher which leverages word embeddings to find objects that are semantically related. We introduce coherent group, a technique to combine word embeddings that works better than other state of the art combination alternatives. We implement SEMPROP as part of Aurum, a data discovery system we are building, and conduct user studies, real deployments and a quantitative evaluation to understand the benefits of links for data discovery tasks, as well as the benefits of SEMPROP and coherent groups to find those links.},
  isbn = {978-1-5386-5520-7},
  langid = {english},
}
% Web resource (@misc) for the ChatGPT application. Fix: the entry had no
% author at all; added OpenAI as a corporate author, double-braced so BibTeX
% treats it as a single indivisible name. Citation key kept unchanged
% (changing it would break existing \cite commands).
@misc{ChatGPT2024,
  title = {{{ChatGPT}}},
  author = {{OpenAI}},
  year = {2024},
  url = {https://chat.openai.com},
  urldate = {2024-04-06},
  abstract = {A conversational AI system that listens, learns, and challenges},
  langid = {american},
}
% arXiv preprint (1903.02188): BAMnet, bidirectional attentive memory
% networks for KBQA. Fix: replaced the bare arXiv abstract-page URL with the
% arXiv-assigned DOI (10.48550/arXiv.<id>), matching the convention of the
% other arXiv entries in this file (e.g. CLARA, HiQA); DOI is preferred over
% URL when both identify the same object.
@misc{chenBidirectionalAttentiveMemory2019,
  title = {Bidirectional {{Attentive Memory Networks}} for {{Question Answering}} over {{Knowledge Bases}}},
  author = {Chen, Yu and Wu, Lingfei and Zaki, Mohammed J.},
  year = {2019},
  month = may,
  number = {arXiv:1903.02188},
  eprint = {1903.02188},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1903.02188},
  urldate = {2024-03-05},
  abstract = {When answering natural language questions over knowledge bases (KBs), different question components and KB aspects play different roles. However, most existing embedding-based methods for knowledge base question answering (KBQA) ignore the subtle inter-relationships between the question and the KB (e.g., entity types, relation paths and context). In this work, we propose to directly model the two-way flow of interactions between the questions and the KB via a novel Bidirectional Attentive Memory Network, called BAMnet. Requiring no external resources and only very few hand-crafted features, on the WebQuestions benchmark, our method significantly outperforms existing information-retrieval based methods, and remains competitive with (hand-crafted) semantic parsing based methods. Also, since we use attention mechanisms, our method offers better interpretability compared to other baselines.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
}
% Conference paper (EMNLP 2014): neural-network transition-based dependency
% parser (Chen & Manning). Clean entry.
@inproceedings{chenFastAccurateDependency2014,
  title = {A {{Fast}} and {{Accurate Dependency Parser}} Using {{Neural Networks}}},
  booktitle = {Proceedings of the 2014 {{Conference}} on {{Empirical Methods}} in {{Natural Language Processing}} ({{EMNLP}})},
  author = {Chen, Danqi and Manning, Christopher},
  year = {2014},
  pages = {740--750},
  publisher = {Association for Computational Linguistics},
  address = {Doha, Qatar},
  doi = {10.3115/v1/D14-1082},
  urldate = {2024-08-29},
  abstract = {Almost all current dependency parsers classify based on millions of sparse indicator features. Not only do these features generalize poorly, but the cost of feature computation restricts parsing speed significantly. In this work, we propose a novel way of learning a neural network classifier for use in a greedy, transition-based dependency parser. Because this classifier learns and uses just a small number of dense features, it can work very fast, while achieving an about 2\% improvement in unlabeled and labeled attachment scores on both English and Chinese datasets. Concretely, our parser is able to parse more than 1000 sentences per second at 92.2\% unlabeled attachment score on the English Penn Treebank.},
  langid = {english},
}
% arXiv preprint (1812.10037): bootstrapping a neural semantic parser from a
% domain ontology. Fix: replaced the bare arXiv abstract-page URL with the
% arXiv-assigned DOI (10.48550/arXiv.<id>), matching the other arXiv entries
% in this file. The annotation field is reference-manager metadata, ignored
% by BibTeX.
@misc{chengBuildingNeuralSemantic2018,
  title = {Building a {{Neural Semantic Parser}} from a {{Domain Ontology}}},
  author = {Cheng, Jianpeng and Reddy, Siva and Lapata, Mirella},
  year = {2018},
  month = dec,
  number = {arXiv:1812.10037},
  eprint = {1812.10037},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1812.10037},
  urldate = {2024-08-19},
  abstract = {Semantic parsing is the task of converting natural language utterances into machine interpretable meaning representations which can be executed against a real-world environment such as a database. Scaling semantic parsing to arbitrary domains faces two interrelated challenges: obtaining broad coverage training data effectively and cheaply; and developing a model that generalizes to compositional utterances and complex intentions. We address these challenges with a framework which allows to elicit training data from a domain ontology and bootstrap a neural parser which recursively builds derivations of logical forms. In our framework meaning representations are described by sequences of natural language templates, where each template corresponds to a decomposed fragment of the underlying meaning representation. Although artificial, templates can be understood and paraphrased by humans to create natural utterances, resulting in parallel triples of utterances, meaning representations, and their decompositions. These allow us to train a neural semantic parser which learns to compose rules in deriving meaning representations. We crowdsource training data on six domains, covering both single-turn utterances which exhibit rich compositionality, and sequential utterances where a complex task is procedurally performed in steps. We then develop neural semantic parsers which perform such compositional tasks. In general, our approach allows to deploy neural semantic parsers quickly and cheaply from a given domain ontology.},
  archiveprefix = {arXiv},
  langid = {english},
  keywords = {Computer Science - Computation and Language,Question generation,SQUALL generation,Synthetic data},
  annotation = {Read\_Status: To Read\\
  Read\_Status\_Date: 2024-11-22T18:16:40.840Z},
}
% arXiv preprint (2402.01767): HiQA, hierarchical contextual augmentation RAG
% for multi-document QA; also introduces the MasQA benchmark. Clean entry
% using the arXiv DOI convention.
@misc{chenHiQAHierarchicalContextual2024,
  title = {{{HiQA}}: {{A Hierarchical Contextual Augmentation RAG}} for {{Multi-Documents QA}}},
  shorttitle = {{{HiQA}}},
  author = {Chen, Xinyue and Gao, Pengyu and Song, Jiangjiang and Tan, Xiaoyang},
  year = {2024},
  month = sep,
  number = {arXiv:2402.01767},
  eprint = {2402.01767},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2402.01767},
  urldate = {2024-12-05},
  abstract = {Retrieval-augmented generation (RAG) has rapidly advanced the language model field, particularly in question-answering (QA) systems. By integrating external documents during the response generation phase, RAG significantly enhances the accuracy and reliability of language models. This method elevates the quality of responses and reduces the frequency of hallucinations, where the model generates incorrect or misleading information. However, these methods exhibit limited retrieval accuracy when faced with numerous indistinguishable documents, presenting notable challenges in their practical application. In response to these emerging challenges, we present HiQA, an advanced multi-document question-answering (MDQA) framework that integrates cascading metadata into content and a multi-route retrieval mechanism. We also release a benchmark called MasQA to evaluate and research in MDQA. Finally, HiQA demonstrates the state-of-the-art performance in multi-document environments.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
}
% Journal article (IEEE TNNLS, 2024): subgraph-guided KG question generation
% with a bidirectional Graph2Seq model. No volume/number/final pages yet
% (pages = 1--12 suggests an early-access version) -- update when the issue
% assignment is available.
@article{chenSubgraphGuidedKnowledgeGraph2024,
  title = {Toward {{Subgraph-Guided Knowledge Graph Question Generation With Graph Neural Networks}}},
  author = {Chen, Yu and Wu, Lingfei and Zaki, Mohammed J.},
  year = {2024},
  journal = {IEEE Transactions on Neural Networks and Learning Systems},
  pages = {1--12},
  issn = {2162-237X, 2162-2388},
  doi = {10.1109/TNNLS.2023.3264519},
  urldate = {2024-03-05},
  abstract = {Knowledge graph (KG) question generation (QG) aims to generate natural language questions from KGs and target answers. Previous works mostly focus on a simple setting that is to generate questions from a single KG triple. In this work, we focus on a more realistic setting where we aim to generate questions from a KG subgraph and target answers. In addition, most previous works built on either RNN- or Transformer-based models to encode a linearized KG subgraph, which totally discards the explicit structure information of a KG subgraph. To address this issue, we propose to apply a bidirectional Graph2Seq model to encode the KG subgraph. Furthermore, we enhance our RNN decoder with a node-level copying mechanism to allow direct copying of node attributes from the KG subgraph to the output question. Both automatic and human evaluation results demonstrate that our model achieves new state-of-the-art scores, outperforming existing methods by a significant margin on two QG benchmarks. Experimental results also show that our QG model can consistently benefit the question-answering (QA) task as a means of data augmentation.},
  langid = {english},
  keywords = {Computer Science - Computation and Language},
}
@misc{chikinChars2vecCharacterbasedWord2023,
title = {Chars2vec: {{Character-based}} Word Embeddings Model Based on {{RNN}} for Handling Real World Texts},
shorttitle = {Chars2vec},
author = {Chikin, Vladimir and Milo, Fabrizio and Solodskih, Kirill},
year = {2023},
month = feb,
url = {https://github.com/IntuitionEngineeringTeam/chars2vec},
urldate = {2024-05-30},
abstract = {Character-based word embeddings model based on RNN for handling real world~texts - IntuitionEngineeringTeam/chars2vec},
copyright = {Apache-2.0},
howpublished = {Intuition.Engineering},
}
@inproceedings{cohenEvaluationSPARQLQuery2013,
title = {Evaluation of {{SPARQL}} Query Generation from Natural Language Questions},
booktitle = {Proceedings of the {{Joint Workshop}} on {{NLP}}\&{{LOD}} and {{SWAIE}}: {{Semantic Web}}, {{Linked Open Data}} and {{Information Extraction}}},
author = {Cohen, K. Bretonnel and Kim, Jin-Dong},
editor = {Maynard, Diana and {van Erp}, Marieke and Davis, Brian and Osenova, Petya and Simov, Kiril and Georgiev, Georgi and Nakov, Preslav},
year = {2013},
month = sep,
pages = {3--7},
publisher = {INCOMA Ltd. Shoumen, BULGARIA},
address = {Hissar, Bulgaria},
url = {https://aclanthology.org/W13-5202},
urldate = {2024-05-23},
}
@inproceedings{cohenPresentingModelBasedSystems2021,
title = {Presenting {{Model-Based Systems Engineering Information}} to {{Non-Modelers}}},
booktitle = {2021 {{IEEE Aerospace Conference}} (50100)},
author = {Cohen, Jeffrey R. and Arai, Sarah and Rakalina, Tatyana and Griffin, Emily and Heiser, Jared and Urbina, Michelle and McGuire, Kerry M. and Rubin, David and Seigel, Alex J. and Shah, Alay and Ramachandran, Sandhya and Dixit, Anusha and Legaspi, Jennifer and Mindock, Jennifer A. and Bardina, Jorge and Hailey, Melinda J.},
year = {2021},
month = mar,
pages = {1--18},
publisher = {IEEE},
address = {Big Sky, MT, USA},
doi = {10.1109/AERO50100.2021.9438292},
urldate = {2024-11-20},
abstract = {NASA's Human Research Program's (HRP) Exploration Medical Capability (ExMC) Element adopted Systems Engineering (SE) principles and Model Based Systems Engineering (MBSE) tools to capture the system functions, system architecture, requirements, interfaces, and clinical capabilities for a future exploration medical system. There are many different stakeholders who may use the information in the model: systems engineers, clinicians (physicians, nurses, and pharmacists), scientists, and program managers. Many of these individuals do not have access to MBSE modeling tools or have never used these tools. Many of these individuals (clinicians, scientists, even program managers) may have no experience with SE in general let alone interpreting a systems model. The challenge faced by ExMC was how to present the content in the model to non-modelers in a way they could understand with limited to no training in MBSE or the Systems Modeling Language (SysML) without using the modeling tool. Therefore, from the model, ExMC created an HTML report that is accessible to anyone with a browser. When creating the HTML report, the ExMC SE team talked to stakeholders and received their feedback on what content they wanted and how to display this content. Factoring in feedback, the report arranges the content in a way that not only directs readers through the SE process taken to derive the requirements, but also helps them to understand the fundamental steps in an SE approach. The report includes links to source information (i.e., NASA documentation that describes levels of care) and other SE deliverables (e.g., Concept of Operations). These links were provided to aid in the understanding of how the team created this content through a methodical SE approach. 
This paper outlines the process used to develop the model, the data chosen to share with stakeholders, many of the model elements used in the report, the review process stakeholders followed, the comments received from the stakeholders, and the lessons ExMC learned through producing this HTML report.},
copyright = {https://doi.org/10.15223/policy-029},
isbn = {978-1-72817-436-5},
langid = {english},
}
@misc{cortesQuestionAnsweringLinked2022,
title = {Question {{Answering}} over Linked Data Benchmark Comparison},
author = {Cortes, Eduardo and Karras, Oliver},
year = {2022},
publisher = {Open Research Knowledge Graph},
doi = {10.48366/R161787},
urldate = {2024-12-17},
abstract = {A Question Answering over linked data benchmark comparison.},
copyright = {Creative Commons Attribution Share Alike 4.0 International},
langid = {english},
keywords = {Natural Language Processing},
}
@article{cuiLifelongEmbeddingLearning2023,
title = {Lifelong {{Embedding Learning}} and {{Transfer}} for {{Growing Knowledge Graphs}}},
author = {Cui, Yuanning and Wang, Yuxin and Sun, Zequn and Liu, Wenqiang and Jiang, Yiqiao and Han, Kexin and Hu, Wei},
year = {2023},
month = jun,
journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
volume = {37},
number = {4},
pages = {4217--4224},
issn = {2374-3468, 2159-5399},
doi = {10.1609/aaai.v37i4.25539},
urldate = {2024-07-10},
abstract = {Existing knowledge graph (KG) embedding models have primarily focused on static KGs. However, real-world KGs do not remain static, but rather evolve and grow in tandem with the development of KG applications. Consequently, new facts and previously unseen entities and relations continually emerge, necessitating an embedding model that can quickly learn and transfer new knowledge through growth. Motivated by this, we delve into an expanding field of KG embedding in this paper, i.e., lifelong KG embedding. We consider knowledge transfer and retention of the learning on growing snapshots of a KG without having to learn embeddings from scratch. The proposed model includes a masked KG autoencoder for embedding learning and update, with an embedding transfer strategy to inject the learned knowledge into the new entity and relation embeddings, and an embedding regularization method to avoid catastrophic forgetting. To investigate the impacts of different aspects of KG growth, we construct four datasets to evaluate the performance of lifelong KG embedding. Experimental results show that the proposed model outperforms the state-of-the-art inductive and lifelong embedding baselines.},
langid = {english},
}
@misc{dahlMBSEModelsGenerated2021,
title = {From {{MBSE}} Models to Generated Documents},
author = {Dahl, Ida},
year = {2021},
month = feb,
howpublished = {Samares Engineering},
url = {https://www.samares-engineering.com/en/2021/02/19/from-mbse-models-to-generated-documents},
urldate = {2024-11-19},
langid = {british},
}
@techreport{darmSystemEngineeringModels2022,
type = {Executive {{Report Summary}}},
title = {System {{Engineering Models Meet Knowledge Graphs Executive Report Summary}}},
author = {Darm, Paul and Berquand, Audrey and Riccardi, Annalisa and Minisci, Edmondo},
year = {2022},
month = jun,
institution = {European Space Agency},
langid = {english},
}
@article{DeepLearningGenerating,
title = {Deep Learning in Generating Radiology Reports: {{A}} Survey},
author = {Monshi, Maram Mahmoud A. and Poon, Josiah and Chung, Vera},
year = {2020},
journal = {Artificial Intelligence in Medicine},
volume = {106},
pages = {101878},
doi = {10.1016/j.artmed.2020.101878},
url = {https://www.sciencedirect.com/science/article/pii/S0933365719302635#sec0085},
urldate = {2024-08-05},
keywords = {Evaluation},
}
@inproceedings{delpModelBasedDocument2013,
title = {Model Based Document and Report Generation for Systems Engineering},
booktitle = {2013 {{IEEE Aerospace Conference}}},
author = {Delp, C. and Lam, D. and Fosse, E. and Lee, Cin-Young},
year = {2013},
month = mar,
pages = {1--11},
publisher = {IEEE},
address = {Big Sky, MT},
doi = {10.1109/AERO.2013.6496926},
urldate = {2023-12-01},
abstract = {As Model Based Systems Engineering (MBSE) practices gain adoption, various approaches have been developed in order to simplify and automate the process of generating documents from models. Essentially, all of these techniques can be unified around the concept of producing different views of the model according to the needs of the intended audience. In this paper, we will describe a technique developed at JPL of applying SysML Viewpoints and Views to generate documents and reports. An architecture of model-based view and document generation will be presented, and the necessary extensions to SysML with associated rationale will be explained. A survey of examples will highlight a variety of views that can be generated, and will provide some insight into how collaboration and integration is enabled. We will also describe the basic architecture for the enterprise applications that support this approach.},
isbn = {978-1-4673-1813-6},
langid = {english},
}
@misc{delpViewpointModelingModel2013,
title = {Viewpoint {{Modeling}} and {{Model Based Media Generation}} for {{Systems Engineers}} -- {{Document Generation}} and {{Scalable Model Based Engineering}}},
author = {Delp, Christopher},
year = {2013},
copyright = {NASA/Caltech Jet Propulsion Laboratory},
}
@article{DescriptionLogic2024,
title = {Description Logic},
year = {2024},
month = nov,
journal = {Wikipedia},
url = {https://en.wikipedia.org/w/index.php?title=Description_logic&oldid=1256136767},
urldate = {2024-11-27},
abstract = {Description logics (DL) are a family of formal knowledge representation languages. Many DLs are more expressive than propositional logic but less expressive than first-order logic. In contrast to the latter, the core reasoning problems for DLs are (usually) decidable, and efficient decision procedures have been designed and implemented for these problems. There are general, spatial, temporal, spatiotemporal, and fuzzy description logics, and each description logic features a different balance between expressive power and reasoning complexity by supporting different sets of mathematical constructors. DLs are used in artificial intelligence to describe and reason about the relevant concepts of an application domain (known as terminological knowledge). It is of particular importance in providing a logical formalism for ontologies and the Semantic Web: the Web Ontology Language (OWL) and its profiles are based on DLs. The most notable application of DLs and OWL is in biomedical informatics where DL assists in the codification of biomedical knowledge.},
copyright = {Creative Commons Attribution-ShareAlike License},
langid = {english},
annotation = {Page Version ID: 1256136767},
}
@misc{dettmersQLoRAEfficientFinetuning2023,
title = {{{QLoRA}}: {{Efficient Finetuning}} of {{Quantized LLMs}}},
shorttitle = {{{QLoRA}}},
author = {Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
year = {2023},
month = may,
number = {arXiv:2305.14314},
eprint = {2305.14314},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.14314},
urldate = {2024-06-01},
abstract = {We present QLoRA, an efficient finetuning approach that reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance. QLoRA backpropagates gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters{\textasciitilde}(LoRA). Our best model family, which we name Guanaco, outperforms all previous openly released models on the Vicuna benchmark, reaching 99.3\% of the performance level of ChatGPT while only requiring 24 hours of finetuning on a single GPU. QLoRA introduces a number of innovations to save memory without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that is information theoretically optimal for normally distributed weights (b) double quantization to reduce the average memory footprint by quantizing the quantization constants, and (c) paged optimizers to manage memory spikes. We use QLoRA to finetune more than 1,000 models, providing a detailed analysis of instruction following and chatbot performance across 8 instruction datasets, multiple model types (LLaMA, T5), and model scales that would be infeasible to run with regular finetuning (e.g. 33B and 65B parameter models). Our results show that QLoRA finetuning on a small high-quality dataset leads to state-of-the-art results, even when using smaller models than the previous SoTA. We provide a detailed analysis of chatbot performance based on both human and GPT-4 evaluations showing that GPT-4 evaluations are a cheap and reasonable alternative to human evaluation. Furthermore, we find that current chatbot benchmarks are not trustworthy to accurately evaluate the performance levels of chatbots. A lemon-picked analysis demonstrates where Guanaco fails compared to ChatGPT. We release all of our models and code, including CUDA kernels for 4-bit training.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Machine Learning},
}
@article{dialloComprehensiveEvaluationNeural2024,
title = {A {{Comprehensive Evaluation}} of {{Neural SPARQL Query Generation From Natural Language Questions}}},
author = {Diallo, Papa Abdou Karim Karou and Reyd, Samuel and Zouaq, Amal},
year = {2024},
month = sep,
journal = {IEEE Access},
volume = {12},
eprint = {2304.07772},
primaryclass = {cs},
pages = {125057--125078},
issn = {2169-3536},
doi = {10.1109/ACCESS.2024.3453215},
urldate = {2024-05-23},
abstract = {In recent years, the field of neural machine translation (NMT) for SPARQL query generation has witnessed significant growth. Incorporating the copy mechanism with traditional encoder-decoder architectures and using pre-trained encoderdecoders and large language models have set new performance benchmarks. This paper presents various experiments that replicate and expand upon recent NMT-based SPARQL generation studies, comparing pre-trained language models (PLMs), non-pretrained language models (NPLMs), and large language models (LLMs), highlighting the impact of question annotation and the copy mechanism and testing various fine-tuning methods using LLMs. In particular, we provide a systematic error analysis of the models and test their generalization ability. Our study demonstrates that the copy mechanism yields significant performance enhancements for most PLMs and NPLMs. Annotating the data is pivotal to generating correct URIs, with the "tag-within" strategy emerging as the most effective approach. Additionally, our findings reveal that the primary source of errors stems from incorrect URIs in SPARQL queries that are sometimes replaced with hallucinated URIs when using base models. This does not happen using the copy mechanism, but it sometimes leads to selecting wrong URIs among candidates. Finally, the performance of the tested LLMs fell short of achieving the desired outcomes.},
archiveprefix = {arXiv},
copyright = {https://creativecommons.org/licenses/by/4.0/legalcode},
langid = {english},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
}
@article{DomainModel2024,
title = {Domain Model},
year = {2024},
month = may,
journal = {Wikipedia},
url = {https://en.wikipedia.org/w/index.php?title=Domain_model&oldid=1223669885},
urldate = {2024-11-20},
abstract = {In software engineering, a domain model is a conceptual model of the domain that incorporates both behavior and data. In ontology engineering, a domain model is a formal representation of a knowledge domain with concepts, roles, datatypes, individuals, and rules, typically grounded in a description logic.},
copyright = {Creative Commons Attribution-ShareAlike License},
langid = {english},
annotation = {Page Version ID: 1223669885},
}
@misc{dubeyEARLJointEntity2018,
title = {{{EARL}}: {{Joint Entity}} and {{Relation Linking}} for {{Question Answering}} over {{Knowledge Graphs}}},
shorttitle = {{{EARL}}},
author = {Dubey, Mohnish and Banerjee, Debayan and Chaudhuri, Debanjan and Lehmann, Jens},
year = {2018},
month = jun,
number = {arXiv:1801.03825},
eprint = {1801.03825},
primaryclass = {cs},
publisher = {arXiv},
url = {http://arxiv.org/abs/1801.03825},
urldate = {2024-03-07},
abstract = {Many question answering systems over knowledge graphs rely on entity and relation linking components in order to connect the natural language input to the underlying knowledge graph. Traditionally, entity linking and relation linking have been performed either as dependent sequential tasks or as independent parallel tasks. In this paper, we propose a framework called EARL, which performs entity linking and relation linking as a joint task. EARL implements two different solution strategies for which we provide a comparative analysis in this paper: The first strategy is a formalisation of the joint entity and relation linking tasks as an instance of the Generalised Travelling Salesman Problem (GTSP). In order to be computationally feasible, we employ approximate GTSP solvers. The second strategy uses machine learning in order to exploit the connection density between nodes in the knowledge graph. It relies on three base features and re-ranking steps in order to predict entities and relations. We compare the strategies and evaluate them on a dataset with 5000 questions. Both strategies significantly outperform the current state-of-the-art approaches for entity and relation linking.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
}
@incollection{dubeyLCQuAD20Large2019,
title = {{{LC-QuAD}} 2.0: {{A Large Dataset}} for {{Complex Question Answering}} over {{Wikidata}} and {{DBpedia}}},
shorttitle = {{{LC-QuAD}} 2.0},
booktitle = {The {{Semantic Web}} -- {{ISWC}} 2019},
author = {Dubey, Mohnish and Banerjee, Debayan and Abdelkawi, Abdelrahman and Lehmann, Jens},
editor = {Ghidini, Chiara and Hartig, Olaf and Maleshkova, Maria and Sv{\'a}tek, Vojt{\v e}ch and Cruz, Isabel and Hogan, Aidan and Song, Jie and Lefran{\c c}ois, Maxime and Gandon, Fabien},
year = {2019},
volume = {11779},
pages = {69--78},
publisher = {Springer International Publishing},
address = {Cham},
doi = {10.1007/978-3-030-30796-7_5},
urldate = {2024-02-29},
abstract = {Providing machines with the capability of exploring knowledge graphs and answering natural language questions has been an active area of research over the past decade. In this direction translating natural language questions to formal queries has been one of the key approaches. To advance the research area, several datasets like WebQuestions, QALD and LCQuAD have been published in the past. The biggest data set available for complex questions (LCQuAD) over knowledge graphs contains five thousand questions. We now provide LC-QuAD 2.0 (Large-Scale Complex Question Answering Dataset) with 30,000 questions, their paraphrases and their corresponding SPARQL queries. LC-QuAD 2.0 is compatible with both Wikidata and DBpedia 2018 knowledge graphs. In this article, we explain how the dataset was created and the variety of questions available with examples. We further provide a statistical analysis of the dataset.},
isbn = {978-3-030-30796-7},
langid = {english},
}
@misc{duffyStructuralTransferLearning2023,
title = {Structural {{Transfer Learning}} in {{NL-to-Bash Semantic Parsers}}},
author = {Duffy, Kyle and Bhattamishra, Satwik and Blunsom, Phil},
year = {2023},
month = jul,
number = {arXiv:2307.16795},
eprint = {2307.16795},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2307.16795},
urldate = {2024-11-22},
abstract = {Large-scale pre-training has made progress in many fields of natural language processing, though little is understood about the design of pre-training datasets. We propose a methodology for obtaining a quantitative understanding of structural overlap between machine translation tasks. We apply our methodology to the natural language to Bash semantic parsing task (NLBash) and show that it is largely reducible to lexical alignment. We also find that there is strong structural overlap between NLBash and natural language to SQL. Additionally, we perform a study varying compute expended during pre-training on the English to German machine translation task and find that more compute expended during pre-training does not always correspond semantic representations with stronger transfer to NLBash.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
}
@misc{elaasarOntologicalModelingLanguage2024,
title = {Ontological {{Modeling Language}} V2},
author = {Elaasar, Maged and Rouquette, Nicolas},
year = {2024},
month = nov,
url = {https://www.opencaesar.io/oml/},
urldate = {2025-01-05},
copyright = {Copyright {\copyright} 2019-2024 California Institute of Technology. Government sponsorship acknowledged.},
}
@inproceedings{elaasarOpenCAESARBalancingAgility2023,
title = {{{openCAESAR}}: {{Balancing Agility}} and {{Rigor}} in {{Model-Based Systems Engineering}}},
shorttitle = {{{openCAESAR}}},
booktitle = {2023 {{ACM}}/{{IEEE International Conference}} on {{Model Driven Engineering Languages}} and {{Systems Companion}} ({{MODELS-C}})},
author = {Elaasar, Maged and Rouquette, Nicolas and Wagner, David and Oakes, Bentley James and {Hamou-Lhadj}, Abdelwahab and Hamdaqa, Mohammad},
year = {2023},
month = oct,
pages = {221--230},
publisher = {IEEE},
address = {V{\"a}ster{\aa}s, Sweden},
doi = {10.1109/MODELS-C59198.2023.00051},
urldate = {2024-08-26},
abstract = {Model-Based System Engineering (MBSE) employs models and formal languages to support development of complex (systems-of-) systems. NASA Jet Propulsion Laboratory (JPL) sees MBSE as a key approach to managing the complexity of system development. However, balancing agility and rigor in MBSE has been reported as a challenging task not yet addressed by modeling tools and frameworks. This is because existing MBSE approaches may enable agility but compromise rigor, or enhance rigor but impede agility. We discuss the challenges of balancing agility and rigor in MBSE across seven systems engineering architectural functions defined by the JPL Integrated Model-Centric Engineering (IMCE) initiative. We demonstrate how openCAESAR, an open-source MBSE methodology and framework created at JPL, can strike a balance between agility and rigor through a case study of the Kepler16b project and discussion of lessons learned from past projects.},
copyright = {https://doi.org/10.15223/policy-029},
isbn = {9798350324983},
langid = {english},
}
@misc{fabbriTreeBasedSemanticParsing2017,
title = {Tree-{{Based Semantic Parsing}}},
author = {Fabbri, Alexander and Radev, Dragomir},
year = {2017},
publisher = {Department of Computer Science, Yale University},
url = {https://web.archive.org/web/20250113091733/https://yale-lily.github.io/public/fabbri_alexander_tree_based_semantic_parsing.pdf},
urldate = {2024-12-05},
}
@misc{fanSurveyRAGMeeting2024,
title = {A {{Survey}} on {{RAG Meeting LLMs}}: {{Towards Retrieval-Augmented Large Language Models}}},
shorttitle = {A {{Survey}} on {{RAG Meeting LLMs}}},
author = {Fan, Wenqi and Ding, Yujuan and Ning, Liangbo and Wang, Shijie and Li, Hengyun and Yin, Dawei and Chua, Tat-Seng and Li, Qing},
year = {2024},
month = jun,
number = {arXiv:2405.06211},
eprint = {2405.06211},
publisher = {arXiv},
doi = {10.48550/arXiv.2405.06211},
urldate = {2024-08-25},
abstract = {As one of the most advanced techniques in AI, Retrieval-Augmented Generation (RAG) can offer reliable and up-to-date external knowledge, providing huge convenience for numerous tasks. Particularly in the era of AI-Generated Content (AIGC), the powerful capacity of retrieval in providing additional knowledge enables RAG to assist existing generative AI in producing high-quality outputs. Recently, Large Language Models (LLMs) have demonstrated revolutionary abilities in language understanding and generation, while still facing inherent limitations, such as hallucinations and out-ofdate internal knowledge. Given the powerful abilities of RAG in providing the latest and helpful auxiliary information, RetrievalAugmented Large Language Models (RA-LLMs) have emerged to harness external and authoritative knowledge bases, rather than solely relying on the model's internal knowledge, to augment the generation quality of LLMs. In this survey, we comprehensively review existing research studies in RA-LLMs, covering three primary technical perspectives: architectures, training strategies, and applications. As the preliminary knowledge, we briefly introduce the foundations and recent advances of LLMs. Then, to illustrate the practical significance of RAG for LLMs, we systematically review mainstream relevant work by their architectures, training strategies, and application areas, detailing specifically the challenges of each and the corresponding capabilities of RA-LLMs. Finally, to deliver deeper insights, we discuss current limitations and several promising directions for future research. Updated information about this survey can be found at https:// advanced-recommendersystems.github.io/ RAG-Meets-LLMs/ 1.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Information Retrieval},
}
@misc{fengIncubatorCaseStudy2021,
title = {The {{Incubator Case Study}} for {{Digital Twin Engineering}}},
author = {Feng, Hao and Gomes, Cl{\'a}udio and Thule, Casper and Lausdahl, Kenneth and Sandberg, Michael and Larsen, Peter Gorm},
year = {2021},
month = feb,
number = {arXiv:2102.10390},
eprint = {2102.10390},
primaryclass = {cs, eess},
publisher = {arXiv},
url = {http://arxiv.org/abs/2102.10390},
urldate = {2024-01-11},
abstract = {To demystify the Digital Twin concept, we built a simple yet representative thermal incubator system. The incubator is an insulated box fitted with a heatbed, and complete with a software system for communication, a controller, and simulation models. We developed two simulation models to predict the temperature inside the incubator, one with two free parameters and one with four free parameters. Our experiments showed that the latter model was better at predicting the thermal inertia of the heatbed itself, which makes it more appropriate for further development of the digital twin. The hardware and software used in this case study are available open source, providing an accessible platform for those who want to develop and verify their own techniques for digital twins.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Electrical Engineering and Systems Science - Systems and Control},
}
@misc{ferreSQUALL,
title = {{{SQUALL}}},
author = {Ferr{\'e}, S{\'e}bastien},
url = {https://people.irisa.fr/Sebastien.Ferre/software/squall/},
urldate = {2024-04-07},
abstract = {SQUALL (Semantic Query and Update High-Level Language) is a controlled natural language (CNL) for querying and updating RDF graphs. The main advantage of CNLs is to reconcile the high-level and natural syntax of natural languages, and the precision and lack of ambiguity of formal languages. SQUALL has a strong adequacy with RDF, and covers all constructs of SPARQL, and many of SPARQL 1.1. Its syntax completely abstracts from low-level notions such as bindings and relational algebra. It features disjunction, negation, quantifiers, built-in predicates, aggregations with grouping, and n-ary relations through reification.},
langid = {american},
}
@misc{ferreSquall2sparql2023,
title = {Squall2sparql},
author = {Ferr{\'e}, S{\'e}bastien},
year = {2023},
month = sep,
url = {http://people.irisa.fr/Sebastien.Ferre/software/squall},
urldate = {2024-08-18},
abstract = {A translator from SQUALL (Semantic Query and Update High Level Language) to SPARQL.},
}
@inproceedings{ferreSquall2sparqlTranslatorControlled2013,
title = {Squall2sparql: A {{Translator}} from {{Controlled English}} to {{Full SPARQL}} 1.1},
booktitle = {{{CEUR Workshop Proceedings}}},
author = {Ferr{\'e}, S{\'e}bastien},
year = {2013},
month = sep,
volume = {1179},
address = {Valencia, Spain},
abstract = {This paper reports on the participation of the system squall2sparql in the QALD-3 question answering challenge for DBpedia. squall2sparql is a translator from SQUALL, a controlled natural language for English, to SPARQL 1.1, a standard expressive query and update language for linked open data. It covers nearly all features of SPARQL 1.1, and is directly applicable to any SPARQL endpoint.},
langid = {english},
}
@misc{ferreSQUALLExamples,
title = {{{SQUALL Examples}}},
author = {Ferr{\'e}, S{\'e}bastien},
url = {http://servolis.irisa.fr:3838/squall/examples},
urldate = {2024-08-23},
}
@article{ferreSQUALLExpressivenessSPARQL2014,
title = {{{SQUALL}}: {{The}} Expressiveness of {{SPARQL}} 1.1 Made Available as a Controlled Natural Language},
shorttitle = {{{SQUALL}}},
author = {Ferr{\'e}, S{\'e}bastien},
year = {2014},
month = nov,
journal = {Data \& Knowledge Engineering},
volume = {94},
pages = {163--188},
issn = {0169-023X},
doi = {10.1016/j.datak.2014.07.010},
urldate = {2024-03-10},
abstract = {The Semantic Web (SW) is now made of billions of triples, which are available as Linked Open Data (LOD) or as RDF stores. The SPARQL query language provides a very expressive way to search and explore this wealth of semantic data. However, userfriendly interfaces are needed to bridge the gap between end-users and SW formalisms. Navigation-based interfaces and natural language interfaces require no or little training, but they cover a small fragment of SPARQL's expressivity. We propose SQUALL, a query and update language that provides the full expressiveness of SPARQL 1.1 through a flexible controlled natural language (e.g., solution modifiers through superlatives, relational algebra through coordinations, filters through comparatives). A comprehensive and modular definition is given as a Montague grammar, and an evaluation of naturalness is done on the QALD challenge. SQUALL is conceived as a component of natural language interfaces, to be combined with lexicons, guided input, and contextual disambiguation. It is available as a Web service that translates SQUALL sentences to SPARQL, and submits them to SPARQL endpoints (e.g., DBpedia), therefore ensuring SW compliance, and leveraging the efficiency of SPARQL engines.},
langid = {english},
}
@article{FutureMathematics2023,
title = {Future of Mathematics},
year = {2023},
month = oct,
journal = {Wikipedia},
url = {https://en.wikipedia.org/w/index.php?title=Future_of_mathematics&oldid=1182226887},
urldate = {2024-12-09},
abstract = {The progression of both the nature of mathematics and individual mathematical problems into the future is a widely debated topic; many past predictions about modern mathematics have been misplaced or completely false, so there is reason to believe that many predictions today will follow a similar path. However, the subject still carries an important weight and has been written about by many notable mathematicians. Typically, they are motivated by a desire to set a research agenda to direct efforts to specific problems, or a wish to clarify, update and extrapolate the way that subdisciplines relate to the general discipline of mathematics and its possibilities. Examples of agendas pushing for progress in specific areas in the future, historical and recent, include Felix Klein's Erlangen program, Hilbert's problems, Langlands program, and the Millennium Prize Problems. In the Mathematics Subject Classification section 01Axx History of mathematics and mathematicians, subsection 01A67 is titled Future prospectives. The accuracy of predictions about mathematics has varied widely and has proceeded very closely to that of technology. As such, it is important to keep in mind that many of the predictions by researchers below may be misguided or turn out to be untrue.},
copyright = {Creative Commons Attribution-ShareAlike License},
langid = {english},
annotation = {Page Version ID: 1182226887},
}
@misc{gaoRetrievalAugmentedGenerationLarge2024,
title = {Retrieval-{{Augmented Generation}} for {{Large Language Models}}: {{A Survey}}},
shorttitle = {Retrieval-{{Augmented Generation}} for {{Large Language Models}}},
author = {Gao, Yunfan and Xiong, Yun and Gao, Xinyu and Jia, Kangxiang and Pan, Jinliu and Bi, Yuxi and Dai, Yi and Sun, Jiawei and Guo, Qianyu and Wang, Meng and Wang, Haofen},
year = {2024},
month = jan,
eprint = {2312.10997},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2312.10997},
urldate = {2024-02-22},
abstract = {Large Language Models (LLMs) demonstrate significant capabilities but face challenges such as hallucination, outdated knowledge, and nontransparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility of the models, particularly for knowledge-intensive tasks, and allows for continuous knowledge updates and integration of domain-specific information. RAG synergistically merges LLMs' intrinsic knowledge with the vast, dynamic repositories of external databases. This comprehensive review paper offers a detailed examination of the progression of RAG paradigms, encompassing the Naive RAG, the Advanced RAG, and the Modular RAG. It meticulously scrutinizes the tripartite foundation of RAG frameworks, which includes the retrieval , the generation and the augmentation techniques. The paper highlights the state-of-the-art technologies embedded in each of these critical components, providing a profound understanding of the advancements in RAG systems. Furthermore, this paper introduces the metrics and benchmarks for assessing RAG models, along with the most up-to-date evaluation framework. In conclusion, the paper delineates prospective avenues for research, including the identification of challenges, the expansion of multi-modalities, and the progression of the RAG infrastructure and its ecosystem. 1.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
}
@misc{gebhartKnowledgeSheavesSheafTheoretic2023,
title = {Knowledge {{Sheaves}}: {{A Sheaf-Theoretic Framework}} for {{Knowledge Graph Embedding}}},
shorttitle = {Knowledge {{Sheaves}}},
author = {Gebhart, Thomas and Hansen, Jakob and Schrater, Paul},
year = {2023},
month = mar,
number = {arXiv:2110.03789},
eprint = {2110.03789},
primaryclass = {cs, math, stat},
publisher = {arXiv},
doi = {10.48550/arXiv.2110.03789},
url = {http://arxiv.org/abs/2110.03789},
urldate = {2024-03-01},
abstract = {Knowledge graph embedding involves learning representations of entities -- the vertices of the graph -- and relations -- the edges of the graph -- such that the resulting representations encode the known factual information represented by the knowledge graph and can be used in the inference of new relations. We show that knowledge graph embedding is naturally expressed in the topological and categorical language of {\textbackslash}textit\{cellular sheaves\}: a knowledge graph embedding can be described as an approximate global section of an appropriate {\textbackslash}textit\{knowledge sheaf\} over the graph, with consistency constraints induced by the knowledge graph's schema. This approach provides a generalized framework for reasoning about knowledge graph embedding models and allows for the expression of a wide range of prior constraints on embeddings. Further, the resulting embeddings can be easily adapted for reasoning over composite relations without special training. We implement these ideas to highlight the benefits of the extensions inspired by this new perspective.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning,Mathematics - Algebraic Topology,Statistics - Machine Learning},
}
@article{gebruDatasheetsDatasets2021,
title = {Datasheets for Datasets},
author = {Gebru, Timnit and Morgenstern, Jamie and Vecchione, Briana and Vaughan, Jennifer Wortman and Wallach, Hanna and Daum{\'e} III, Hal and Crawford, Kate},
year = {2021},
month = dec,
journal = {Communications of the ACM},
volume = {64},