diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 0000000..6563593 --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,23 @@ +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v1 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf \ No newline at end of file diff --git a/docs/Phase_1.png b/docs/Phase_1.png index b16802c..f9f80eb 100644 Binary files a/docs/Phase_1.png and b/docs/Phase_1.png differ diff --git a/docs/Phase_3.png b/docs/Phase_3.png index 132d453..72ae513 100644 Binary files a/docs/Phase_3.png and b/docs/Phase_3.png differ diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..1833e6b --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,141 @@ +@article{allison:2016, + title = {Reproducibility: {A} tragedy of errors}, + volume = {530}, + issn = {1476-4687}, + shorttitle = {Reproducibility}, + url = {http://www.nature.com/articles/530027a}, + doi = {10.1038/530027a}, + abstract = {Mistakes in peer-reviewed papers are easy to find but hard to fix, report David B. Allison and colleagues.}, + language = {en}, + number = {7588}, + urldate = {2022-06-01}, + journal = {Nature}, + author = {Allison, David B. and Brown, Andrew W. and George, Brandon J. and Kaiser, Kathryn A.}, + month = feb, + year = {2016}, + note = {Number: 7588 +Publisher: Nature Publishing Group}, + keywords = {Communication, Peer review, Publishing}, + pages = {27--29}, + file = {Full Text PDF:C\:\\Users\\beld\\Zotero\\storage\\N5X2EGIU\\Allison et al. 
- 2016 - Reproducibility A tragedy of errors.pdf:application/pdf;Snapshot:C\:\\Users\\beld\\Zotero\\storage\\HR2DJUNN\\530027a.html:text/html}, +} + +@article{cosentino_systematic_2017, + title = {A {Systematic} {Mapping} {Study} of {Software} {Development} {With} {GitHub}}, + volume = {5}, + issn = {2169-3536}, + doi = {10.1109/ACCESS.2017.2682323}, + abstract = {Context: GitHub, nowadays the most popular social coding platform, has become the reference for mining Open Source repositories, a growing research trend aiming at learning from previous software projects to improve the development of new ones. In the last years, a considerable amount of research papers have been published reporting findings based on data mined from GitHub. As the community continues to deepen in its understanding of software engineering thanks to the analysis performed on this platform, we believe that it is worthwhile to reflect on how research papers have addressed the task of mining GitHub and what findings they have reported. Objective: The main objective of this paper is to identify the quantity, topic, and empirical methods of research works, targeting the analysis of how software development practices are influenced by the use of a distributed social coding platform like GitHub. Method: A systematic mapping study was conducted with four research questions and assessed 80 publications from 2009 to 2016. Results: Most works focused on the interaction around coding-related tasks and project communities. We also identified some concerns about how reliable were these results based on the fact that, overall, papers used small data sets and poor sampling techniques, employed a scarce variety of methodologies and/or were hard to replicate. Conclusions: This paper attested the high activity of research work around the field of Open Source collaboration, especially in the software domain, revealed a set of shortcomings and proposed some actions to mitigate them. 
We hope that this paper can also create the basis for additional studies on other collaborative activities (like book writing for instance) that are also moving to GitHub.}, + journal = {IEEE Access}, + author = {Cosentino, Valerio and Cánovas Izquierdo, Javier L. and Cabot, Jordi}, + year = {2017}, + note = {Conference Name: IEEE Access}, + keywords = {Collaboration, Conferences, Data mining, GitHub, Libraries, open source software, Software, Software engineering, systematic mapping study, Systematics}, + pages = {7173--7192}, + file = {IEEE Xplore Abstract Record:C\:\\Users\\beld\\Zotero\\storage\\DNTPHCVB\\7887704.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\beld\\Zotero\\storage\\CR27J8FJ\\Cosentino et al. - 2017 - A Systematic Mapping Study of Software Development.pdf:application/pdf}, +} + +@article{wilkinson_fair_2016, + title = {The {FAIR} {Guiding} {Principles} for scientific data management and stewardship}, + volume = {3}, + copyright = {2016 The Author(s)}, + issn = {2052-4463}, + url = {http://www.nature.com/articles/sdata201618}, + doi = {10.1038/sdata.2016.18}, + abstract = {There is an urgent need to improve the infrastructure supporting the reuse of scholarly data. A diverse set of stakeholders—representing academia, industry, funding agencies, and scholarly publishers—have come together to design and jointly endorse a concise and measureable set of principles that we refer to as the FAIR Data Principles. The intent is that these may act as a guideline for those wishing to enhance the reusability of their data holdings. Distinct from peer initiatives that focus on the human scholar, the FAIR Principles put specific emphasis on enhancing the ability of machines to automatically find and use the data, in addition to supporting its reuse by individuals. 
This Comment is the first formal publication of the FAIR Principles, and includes the rationale behind them, and some exemplar implementations in the community.}, + language = {en}, + number = {1}, + urldate = {2022-05-11}, + journal = {Scientific Data}, + author = {Wilkinson, Mark D. and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E. and Bouwman, Jildau and Brookes, Anthony J. and Clark, Tim and Crosas, Mercè and Dillo, Ingrid and Dumon, Olivier and Edmunds, Scott and Evelo, Chris T. and Finkers, Richard and Gonzalez-Beltran, Alejandra and Gray, Alasdair J. G. and Groth, Paul and Goble, Carole and Grethe, Jeffrey S. and Heringa, Jaap and ’t Hoen, Peter A. C. and Hooft, Rob and Kuhn, Tobias and Kok, Ruben and Kok, Joost and Lusher, Scott J. and Martone, Maryann E. and Mons, Albert and Packer, Abel L. and Persson, Bengt and Rocca-Serra, Philippe and Roos, Marco and van Schaik, Rene and Sansone, Susanna-Assunta and Schultes, Erik and Sengstag, Thierry and Slater, Ted and Strawn, George and Swertz, Morris A. and Thompson, Mark and van der Lei, Johan and van Mulligen, Erik and Velterop, Jan and Waagmeester, Andra and Wittenburg, Peter and Wolstencroft, Katherine and Zhao, Jun and Mons, Barend}, + month = mar, + year = {2016}, + note = {Number: 1 +Publisher: Nature Publishing Group}, + keywords = {Research data, Publication characteristics}, + pages = {160018}, + file = {Full Text PDF:C\:\\Users\\beld\\Zotero\\storage\\6QBMLUND\\Wilkinson et al. 
- 2016 - The FAIR Guiding Principles for scientific data ma.pdf:application/pdf;Snapshot:C\:\\Users\\beld\\Zotero\\storage\\VD4UQEDH\\sdata201618.html:text/html}, +} + +@article{lamprecht_towards_2020, + title = {Towards FAIR principles for research software}, + volume = {3}, + issn = {24518492, 24518484}, + url = {https://www.medra.org/servlet/aliasResolver?alias=iospress&doi=10.3233/DS-190026}, + doi = {10.3233/DS-190026}, + abstract = {The FAIR Guiding Principles, published in 2016, aim to improve the findability, accessibility, interoperability and reusability of digital research objects for both humans and machines. Until now the FAIR principles have been mostly applied to research data. The ideas behind these principles are, however, also directly relevant to research software. Hence there is a distinct need to explore how the FAIR principles can be applied to software. In this work, we aim to summarize the current status of the debate around FAIR and software, as basis for the development of community-agreed principles for FAIR research software in the future. We discuss what makes software different from data with regard to the application of the FAIR principles, and which desired characteristics of research software go beyond FAIR. Then we present an analysis of where the existing principles can directly be applied to software, where they need to be adapted or reinterpreted, and where the definition of additional principles is required. Here interoperability has proven to be the most challenging principle, calling for particular attention in future discussions. 
Finally, we outline next steps on the way towards definite FAIR principles for research software.}, + language = {en}, + number = {1}, + urldate = {2022-03-09}, + journal = {Data Science}, + author = {Lamprecht, Anna-Lena and Garcia, Leyla and Kuzak, Mateusz and Martinez, Carlos and Arcila, Ricardo and Martin Del Pico, Eva and Dominguez Del Angel, Victoria and van de Sandt, Stephanie and Ison, Jon and Martinez, Paula Andrea and McQuilton, Peter and Valencia, Alfonso and Harrow, Jennifer and Psomopoulos, Fotis and Gelpi, Josep Ll. and Chue Hong, Neil and Goble, Carole and Capella-Gutierrez, Salvador}, + editor = {Groth, Paul and Groth, Paul and Dumontier, Michel}, + month = jun, + year = {2020}, + pages = {37--59}, + file = {Lamprecht et al. - 2020 - Towards FAIR principles for research software.pdf:C\:\\Users\\beld\\Zotero\\storage\\ZHTGDEKQ\\Lamprecht et al. - 2020 - Towards FAIR principles for research software.pdf:application/pdf}, +} + +@software{Spaaks_howfairis_2022, +author = {Spaaks, Jurriaan H. and Verhoeven, Stefan and Tjong Kim Sang, Erik and Diblen, Faruk and Martinez-Ortiz, Carlos and Etuk, Edidiong and Kuzak, Mateusz and van Werkhoven, Ben and Soares Siqueira, Abel and Saladi, Shyam and Holding, Andrew}, +license = {Apache-2.0}, +month = sep, +title = {{howfairis}}, +url = {https://github.com/fair-software/howfairis}, +version = {0.14.2}, +year = {2022} +} + + +@mastersthesis{quach_mapping_2022, + title = {Mapping {Research} {Software} {Landscapes} through {Exploratory} {Studies} of {GitHub} {Data}}, + copyright = {CC-BY-NC-ND}, + url = {https://studenttheses.uu.nl/handle/20.500.12932/43162}, + abstract = {Research software enables data processing and plays a vital role in academia and industry. As such, it is essential to have findable, accessible, interoperable, and reusable (FAIR) research software. However, what precisely the landscape of research software looks like is unknown. 
Thus, we would like to understand the research software landscape better and utilize this information to infer actionable recommendations for the Research Software Engineer (RSE) practice. This study provides insights into the research software landscape at Utrecht University through an exploratory analysis while also considering the different scientific domains. We achieve this by collecting GitHub data and analyzing repository FAIRness and characteristics through heatmaps, histograms, statistical tables, and tests. Our method retrieved 176 users with 1521 repositories, of which 823 are considered research software. Others can adopt the proposed method to gain insights into their specific organization, as it is designed to be reproducible and reusable. The analysis showed significant differences between faculty characteristics and how to support the application of FAIR variables. Among other things, our results showed that Geosciences have the highest percentage of unlicensed repositories with 57\%. Also, Social Sciences are an outlier in language usage, as they are the only faculty to primarily use R, while other faculties primarily use Python. A first classification model is developed that achieves 70\% accuracy in identifying research software that can be used for future labelling tasks. Our recommendations include expanding the R café, creating FAIR reference documents, featuring and highlighting high impact and FAIR research software, and creating yearly reports. 
We conclude that our labelled GitHub dataset allows us to infer actionable recommendations on RSE practice.}, + language = {EN}, + urldate = {2023-07-08}, + author = {Quach, Keven}, + year = {2022}, + note = {Accepted: 2022-11-08T00:00:40Z}, + file = {Full Text PDF:C\:\\Users\\beld\\Zotero\\storage\\V6KC5CBI\\Quach - 2022 - Mapping Research Software Landscapes through Explo.pdf:application/pdf}, +} + +@misc{quach_keven_2023_8150215, + author = {Quach, Keven and + Lamprecht, Anna-Lena and + De Bruin, Jonathan}, + title = {{Mapping Research Software Landscapes through + Exploratory Studies of GitHub Data}}, + month = jul, + year = 2023, + publisher = {Zenodo}, + doi = {10.5281/zenodo.8150215}, + url = {https://doi.org/10.5281/zenodo.8150215} +} + +@Article{Barker2022, +author={Barker, Michelle +and Chue Hong, Neil P. +and Katz, Daniel S. +and Lamprecht, Anna-Lena +and Martinez-Ortiz, Carlos +and Psomopoulos, Fotis +and Harrow, Jennifer +and Castro, Leyla Jael +and Gruenpeter, Morane +and Martinez, Paula Andrea +and Honeyman, Tom}, +title={Introducing the FAIR Principles for research software}, +journal={Scientific Data}, +year={2022}, +month={Oct}, +day={14}, +volume={9}, +number={1}, +pages={622}, +abstract={Research software is a fundamental and vital part of research, yet significant challenges to discoverability, productivity, quality, reproducibility, and sustainability exist. Improving the practice of scholarship is a common goal of the open science, open source, and FAIR (Findable, Accessible, Interoperable and Reusable) communities and research software is now being understood as a type of digital object to which FAIR should be applied. This emergence reflects a maturation of the research community to better understand the crucial role of FAIR research software in maximising research value. The FAIR for Research Software (FAIR4RS) Working Group has adapted the FAIR Guiding Principles to create the FAIR Principles for Research Software (FAIR4RS Principles). 
The contents and context of the FAIR4RS Principles are summarised here to provide the basis for discussion of their adoption. Examples of implementation by organisations are provided to share information on how to maximise the value of research outputs, and to encourage others to amplify the importance and impact of this work.}, +issn={2052-4463}, +doi={10.1038/s41597-022-01710-x}, +url={https://doi.org/10.1038/s41597-022-01710-x} +} + diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..b9551a8 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,62 @@ +--- +title: "SWORDS: A framework for the Scan and revieW of Open Research Data and Software" +tags: + - Python + - data analysis + - FAIR + - open science +authors: + - name: Keven Quach + orcid: 0009-0002-7017-0331 + equal-contrib: true + corresponding: true + affiliation: 1 + - name: Jonathan de Bruin + orcid: 0000-0002-4297-0502 + equal-contrib: true + affiliation: 2 + - name: Anna-Lena Lamprecht + orcid: 0000-0003-1953-5606 + equal-contrib: true + affiliation: 3 +affiliations: + - name: Independent Researcher, Germany + index: 1 + - name: Utrecht University, Netherlands + index: 2 + - name: University of Potsdam, Germany + index: 3 +date: 18 June 2023 +bibliography: paper.bib + +--- + +# Summary + +SWORDS (Scan and revieW of Open Research Data and Software) is a framework designed to provide insights into the open-source activities of an organization and its members through a structured approach. The framework focuses on organizations within the research domain by taking academic publishing principles into account. A big challenge for such a framework lies in the decentralization of open-source activities. SWORDS is divided into three core phases that can be executed independently: + +1. Finding GitHub user profiles associated with an organization. + ![Phase 1](phase_1.png){width=50%} +2. Extracting relevant repositories. + ![Phase 2](phase_2.png){width=50%} +3. 
Studying the contents of the repositories. Content evaluation includes aspects of quality assessment, documentation availability, and FAIRness (Findability, Accessibility, Interoperability, and Reusability) [@wilkinson_fair_2016] scores [@Spaaks_howfairis_2022]. + ![Phase 3](phase_3.png){width=50%} + +To illustrate, an organization that has already collected the GitHub user profiles does not need to execute phase 1. An organization that is only interested in collecting the GitHub user profiles does not need to execute the following phases. An organization that has already collected relevant repositories does not need to execute phases 1 and 2. + +In the past, research organizations had insufficient tooling for the analysis of research output like software and data. Over the years, many initiatives were introduced, and the FAIR principles in particular contributed to an improvement of available tools [@Barker2022]. This enabled us to have a better understanding of the publication principles for transparent and reproducible publication of data and software that is also quantifiable through monitoring and evaluation. These insights can be useful as described in the [statement of need](#statement-of-need). + +Written in Python, SWORDS provides a template for easy implementation within any organization and focuses on GitHub, which is the go-to reference for mining open-source repositories [@cosentino_systematic_2017]. It is designed to be extensible and flexible, which allows evaluating repositories on custom-defined metrics and collecting users according to different strategies. The framework was applied to Utrecht University as part of a research project [@quach_mapping_2022]. The results of the aforementioned research project were presented at a conference [@quach_keven_2023_8150215]. Further research projects are currently being conducted at the University of Potsdam. 
There are also related open-source program office (OSPO) tools, which do not fit the academic use case yet. + +# Statement of need + +Open Science, promoting transparency in academic publications, data, software, and other types of output, is crucial for enhancing scientific and societal impact in today's research climate. The application of Open Science principles to research data and software is vital for ensuring scientific integrity and reproducibility, which are sometimes lacking [@allison:2016]. However, substantial challenges persist in tracking, managing, and understanding open-source research software due to the scattered and fragmented nature of these activities across multiple platforms [@lamprecht_towards_2020]. + +The SWORDS framework addresses this need by providing a systematic approach to collating, analyzing, and understanding an organization's open-source research software. The insights gained from implementing SWORDS can help organizations connect initiatives, improve quality, reward and recognize contributions, and foster a collaborative and open research environment. Thus, SWORDS is an invaluable tool for any research organization aiming to improve and better understand its open-source activities and drive forward Open Science. + + +# Acknowledgements + +We acknowledge contributions from Christopher Slewe during the genesis of this project. We also acknowledge the funding programs: the Open Science Programme (OSP) and FAIR Research IT. 
+ +# References diff --git a/paper/paper.pdf b/paper/paper.pdf new file mode 100644 index 0000000..856fee3 Binary files /dev/null and b/paper/paper.pdf differ diff --git a/paper/phase_1.png b/paper/phase_1.png new file mode 100644 index 0000000..2ffcf0c Binary files /dev/null and b/paper/phase_1.png differ diff --git a/paper/phase_2.png b/paper/phase_2.png new file mode 100644 index 0000000..67e224e Binary files /dev/null and b/paper/phase_2.png differ diff --git a/paper/phase_3.png b/paper/phase_3.png new file mode 100644 index 0000000..e31f5f3 Binary files /dev/null and b/paper/phase_3.png differ