paper/paper.md
authors:
  - name: Sebastian Rosenzweig
    orcid: 0000-0003-4964-9217
    equal-contrib: true
    corresponding: true
    affiliation: 1
  - name: Marius Kriegerowski
    orcid:
    equal-contrib: true
    corresponding: false
    affiliation: 2
  - name: Frank Scherbaum
bibliography: paper.bib

# Summary
Polyphonic singing is one of the most widespread forms of music-making. During a performance, singers must constantly adjust their pitch to stay in tune with one another — a complex skill that requires extensive practice. Research has shown that pitch monitoring tools can assist singers in fine-tuning their intonation during a performance [@BerglinPD22_VisualFeedback_JPM]. Specifically, real-time visualizations of the fundamental frequency (F0), which represents the pitch of the singing voice, help singers assess their pitch relative to a fixed reference or other voices.
To support the monitoring of polyphonic singing performances, we developed `pytch`, an interactive Python tool with a graphical user interface (GUI) designed to record, process, and visualize multiple voices in real time. The GUI displays vocal spectra and estimated F0 trajectories for all singers, as well as the harmonic intervals between them. Additionally, users can adjust visual and algorithmic parameters interactively to accommodate different input devices, microphone signals, singing styles and use cases. Written in Python, `pytch` utilizes the `libf0-realtime` library [@MeierSM25_RealTimeF0_ISMIR] for real-time F0 estimation and `pyqtgraph`[^1] for efficient visualizations of the analysis results.
Our tool builds upon a late-breaking demo presented in [@KriegerowskiS_Pytch_2017], which we refer to as version 1. Since then, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware; we refer to this extended tool as version 2. Over its seven years of development, `pytch` has been tested and refined through use in several rehearsals, workshops, and field studies, including Sardinian quartet singing (see demo video[^2]) and traditional Georgian singing (see demo video[^3]).
# Multitrack Singing Recordings
To fully leverage the capabilities of `pytch`, it is essential to record each singer with an individual microphone. Stereo recordings, such as those captured by a room microphone placed in front of the ensemble, often suffer from overlapping signals, making it difficult to analyze individual voices. While there is no hard limit on the number of channels, we recommend recording up to four individual singers to ensure the visibility of the charts and the responsiveness of the GUI. Suitable multitrack recordings can be obtained using handheld dynamic microphones or headset microphones. However, these setups are prone to cross-talk, especially when singers are positioned close together.
One way to reduce cross-talk is to increase the physical distance between singers or to record them in isolation. However, this is not always feasible, as singers need to hear one another to maintain accurate tuning. An effective workaround is the use of contact microphones, such as throat microphones, which capture vocal fold vibrations directly from the skin of the throat. This method offers a significant advantage: the recorded signals are largely immune to interference from other singers, resulting in much cleaner, more isolated recordings. Throat microphones have been successfully used to record vocal ensembles in several past studies [@Scherbaum16_LarynxMicrophones_IWFMA].
# Audio Processing
The real-time audio processing pipeline implemented in the file `audio.py` is the heart of `pytch` and consists of two main stages: recording and analysis. The recording stage captures multichannel audio waveforms from the soundcard or an external audio interface using the `sounddevice` library. The library is based on PortAudio and supports a wide range of operating systems, audio devices, and sampling rates. The recorded audio is received in chunks via a recording callback and fed into a ring buffer shared with the analysis process. When the buffer is sufficiently filled with audio chunks, the analysis process reads the recorded audio to compute several audio features.
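To illustrate the interplay of the recording callback and the ring buffer, the following minimal sketch shows how such a recording stage could be built with `sounddevice`. It is not taken from `pytch`'s sources; the channel count, block size, and buffer length are arbitrary example values.

```python
import numpy as np
import sounddevice as sd

CHANNELS = 2                    # e.g., two singers, one microphone each
SAMPLE_RATE = 44100
CHUNK = 512                     # block size delivered to the callback
BUFFER_LEN = SAMPLE_RATE * 10   # ring buffer holds 10 s of audio

ring = np.zeros((BUFFER_LEN, CHANNELS), dtype=np.float32)
write_pos = 0

def callback(indata, frames, time, status):
    """Recording callback: copy each incoming chunk into the ring buffer."""
    global write_pos
    if status:
        print(status)
    end = write_pos + frames
    if end <= BUFFER_LEN:
        ring[write_pos:end] = indata
    else:  # wrap around at the end of the buffer
        first = BUFFER_LEN - write_pos
        ring[write_pos:] = indata[:first]
        ring[:frames - first] = indata[first:]
    write_pos = end % BUFFER_LEN

with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS,
                    blocksize=CHUNK, callback=callback):
    sd.sleep(10_000)  # record for 10 s while the analysis runs elsewhere
```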
For each channel, the analysis stage computes the audio level in dBFS, a time-frequency representation of the audio signal via the Short-Time Fourier Transform (see [@Mueller21_FMP_SPRINGER] for fundamentals of music processing), and an estimate of the F0 along with a confidence value using the `libf0-realtime` library [@MeierSM25_RealTimeF0_ISMIR]. The library includes several real-time implementations of well-known F0 estimation algorithms, such as YIN [@CheveigneK02_YIN_JASA] and SWIPE [@CamachoH08_SawtoothWaveform_JASA]. YIN is a time-domain algorithm that computes the F0 based on a modified autocorrelation function. It is computationally efficient and well-suited for low-latency applications, but it tends to suffer from estimation errors, particularly confusions with higher harmonics such as the octave. In contrast, SWIPE is a frequency-domain algorithm that estimates the F0 by matching different spectral representations of the audio with sawtooth-like kernels. While more computationally demanding, SWIPE typically yields more reliable estimates, in particular for vocal input signals. `pytch` allows users to choose between these algorithms depending on their specific needs and system capabilities. The obtained F0 estimates, which are natively computed in Hz, are converted to cents using a user-specified reference frequency. Depending on the audio quality and vocal characteristics, F0 estimates may exhibit artifacts such as discontinuities or pitch slides, which can make the resulting trajectories difficult to interpret [@RosenzweigSM19_StableF0_ISMIR]. Previous research has shown that using throat microphones can improve the isolation of individual voices in group singing contexts, resulting in cleaner signals and more accurate F0 estimates [@Scherbaum16_LarynxMicrophones_IWFMA]. To further enhance interpretability, `pytch` includes several optional post-processing steps: a confidence threshold to discard estimates with a low confidence score, a median filter to smooth the trajectories, and a gradient filter to suppress abrupt pitch slides. As a final step of the audio analysis, the harmonic intervals between the F0 trajectories are computed. Every audio feature is stored separately in a dedicated ring buffer. After processing, the pipeline sets a flag that notifies the GUI that new data is ready for visualization.
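The following sketch illustrates these analysis and post-processing steps with NumPy and SciPy. It is not `pytch`'s actual implementation: the `libf0-realtime` API is omitted, the F0 and confidence arrays are assumed to be given, and all parameter values (reference frequency, thresholds, filter lengths) are illustrative.

```python
import numpy as np
from scipy.signal import medfilt

def level_dbfs(chunk):
    """Audio level in dBFS for a float32 chunk scaled to [-1, 1]."""
    rms = np.sqrt(np.mean(chunk ** 2))
    return 20.0 * np.log10(max(rms, 1e-12))

def postprocess_f0(f0_hz, confidence, ref_hz=220.0, conf_thresh=0.5,
                   median_len=5, max_cents_per_frame=50.0):
    """Convert an F0 trajectory to cents and clean it up (illustrative)."""
    f0_hz = np.maximum(np.asarray(f0_hz, dtype=float), 1e-6)  # avoid log2(0)
    # Hz -> cents, relative to a user-specified reference frequency.
    cents = 1200.0 * np.log2(f0_hz / ref_hz)
    # Confidence threshold: discard estimates with a low confidence score.
    cents[np.asarray(confidence) < conf_thresh] = np.nan
    # Median filter to smooth the trajectory.
    valid = ~np.isnan(cents)
    cents[valid] = medfilt(cents[valid], kernel_size=median_len)
    # Gradient filter: suppress abrupt pitch slides between frames.
    jumps = np.abs(np.diff(cents, prepend=cents[0])) > max_cents_per_frame
    cents[jumps] = np.nan
    return cents

# The harmonic interval between two voices is then simply the difference
# of their trajectories in cents:
# interval = postprocess_f0(f0_a, conf_a) - postprocess_f0(f0_b, conf_b)
```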
# Graphical User Interface (GUI)
In this section, we provide a step-by-step explanation of the `pytch` GUI implemented in the file `gui.py`. Right after program start, a startup menu opens in which the user is asked to specify the soundcard, input channels, sampling rate, and window size for processing. Furthermore, the user can choose to store the recorded audio and the F0 trajectories on disk. These configuration choices are required to initialize the audio processing module and the main GUI, which is loaded when the user clicks "ok". A screenshot of the main GUI, which opens after successful initialization, is shown in \autoref{fig:GUI}.
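As an illustration of the device query behind such a startup menu, the `sounddevice` library can enumerate all available soundcards. The following snippet is a sketch of this step, not `pytch`'s actual menu code.

```python
import sounddevice as sd

# List all devices that offer at least one input channel, similar to
# what a soundcard selection dialog needs to display.
for idx, dev in enumerate(sd.query_devices()):
    if dev["max_input_channels"] > 0:
        print(f"[{idx}] {dev['name']}: {dev['max_input_channels']} in, "
              f"{dev['default_samplerate']:.0f} Hz")
```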
![Screenshot of the main `pytch` GUI.\label{fig:GUI}]{ width=90% }