
Commit a335cde

literature tidy-up
1 parent 5586980 commit a335cde

2 files changed: +37, -147 lines changed

paper/paper.bib

Lines changed: 30 additions & 142 deletions
@@ -55,6 +55,20 @@ @book{Mueller21_FMP_SPRINGER
   url-details = {http://www.music-processing.de}
 }
 
+@article{RosenzweigCWSGM20_DCS_TISMIR,
+  author = {Sebastian Rosenzweig and Helena Cuesta and Christof Wei{\ss} and Frank Scherbaum and Emilia G{\'o}mez and Meinard M{\"u}ller},
+  title = {{D}agstuhl {ChoirSet}: {A} Multitrack Dataset for {MIR} Research on Choral Singing},
+  journal = {Transactions of the International Society for Music Information Retrieval ({TISMIR})},
+  volume = {3},
+  number = {1},
+  year = {2020},
+  pages = {98--110},
+  publisher = {Ubiquity Press},
+  doi = {10.5334/tismir.48},
+  url-pdf = {2020_RosenzweigCWSGM_DagstuhlChoirSet_TISMIR_ePrint.pdf},
+  url-demo = {https://www.audiolabs-erlangen.de/resources/MIR/2020-DagstuhlChoirSet}
+}
+
 @inproceedings{ScherbaumMRM19_MultimediaRecordings_FMA,
   author = {Frank Scherbaum and Nana Mzhavanadze and Sebastian Rosenzweig and Meinard M{\"u}ller},
   title = {Multi-media recordings of traditional {G}eorgian vocal music for computational analysis},
@@ -111,77 +125,6 @@ @inproceedings{CannamLS10_SonicVisualizer_ICMC
   year = {2010},
 }
 
-@article{Boersma01_Praat_GI,
-  author = {Paul Boersma},
-  journal = {Glot International},
-  number = {9/10},
-  pages = {341--345},
-  title = {{Praat}, a system for doing phonetics by computer},
-  volume = {5},
-  year = {2001}
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-@article{CheveigneK02_YIN_JASA,
-  author = {Alain de Cheveign{\'e} and Hideki Kawahara},
-  title = {{YIN}, a fundamental frequency estimator for speech and music.},
-  journal = {Journal of the Acoustical Society of America (JASA)},
-  year = {2002},
-  volume = {111},
-  pages = {1917--1930},
-  number = {4},
-}
-
-@inproceedings{MauchD14_pYIN_ICASSP,
-  author = {Matthias Mauch and Simon Dixon},
-  title = {{pYIN}: A Fundamental Frequency Estimator Using Probabilistic Threshold Distributions},
-  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
-  year = {2014},
-  address = {Florence, Italy},
-  pages = {659--663},
-}
-
-@article{SalamonG12_MelodyExtraction_TASLP,
-  Author = {Justin Salamon and Emilia G{\'o}mez},
-  Title = {Melody Extraction from Polyphonic Music Signals using Pitch Contour Characteristics},
-  Journal = {IEEE Transactions on Audio, Speech, and Language Processing},
-  Number = {6},
-  Volume = {20},
-  Pages = {1759--1770},
-  Year = {2012},
-  doi = {10.1109/TASL.2012.2188515}
-}
-
 @article{CamachoH08_SawtoothWaveform_JASA,
   author = {Arturo Camacho and John G. Harris},
   title = {A sawtooth waveform inspired pitch estimator for speech and music},
@@ -193,79 +136,24 @@ @article{CamachoH08_SawtoothWaveform_JASA
   pages = {1638--1652},
 }
 
-@inproceedings{BittnerFRJCK19_mirdata_ISMIR,
-  author = {Rachel M. Bittner and Magdalena Fuentes and David Rubinstein and Andreas Jansson and Keunwoo Choi and Thor Kell},
-  title = {{mirdata}: Software for Reproducible Usage of Datasets},
-  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
-  pages = {99--106},
-  year = {2019},
-  address = {Delft, The Netherlands},
-  url = {http://archives.ismir.net/ismir2019/paper/000009.pdf}
-}
-
-@inproceedings{RaffelMHSNLE14_MirEval_ISMIR,
-  author = {Colin Raffel and Brian McFee and Eric J. Humphrey and Justin Salamon and Oriol Nieto and Dawen Liang and Daniel P. W. Ellis},
-  title = {{MIR{\_}EVAL}: {A} Transparent Implementation of Common {MIR} Metrics},
-  pages = {367--372},
-  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
-  address = {Taipei, Taiwan},
-  year = {2014},
-}
-
-@article{RosenzweigCWSGM20_DCS_TISMIR,
-  author = {Sebastian Rosenzweig and Helena Cuesta and Christof Wei{\ss} and Frank Scherbaum and Emilia G{\'o}mez and Meinard M{\"u}ller},
-  title = {{D}agstuhl {ChoirSet}: {A} Multitrack Dataset for {MIR} Research on Choral Singing},
-  journal = {Transactions of the International Society for Music Information Retrieval ({TISMIR})},
-  volume = {3},
-  number = {1},
-  year = {2020},
-  pages = {98--110},
-  publisher = {Ubiquity Press},
-  doi = {10.5334/tismir.48},
-  url-pdf = {2020_RosenzweigCWSGM_DagstuhlChoirSet_TISMIR_ePrint.pdf},
-  url-demo = {https://www.audiolabs-erlangen.de/resources/MIR/2020-DagstuhlChoirSet}
-}
-
-@inproceedings{BittnerSBB17_PitchContours_AES,
-  author = {Rachel M. Bittner and Justin Salamon and Juan J. Bosch and Juan Pablo Bello},
-  title = {Pitch Contours as a Mid-Level Representation for Music Informatics},
-  booktitle = {Proceedings of the {AES} International Conference on Semantic Audio},
-  address = {Erlangen, Germany},
-  pages = {100--107},
-  year = {2017},
-  url = {http://www.aes.org/e-lib/browse.cfm?elib=18756}
-}
-
-@inproceedings{RosenzweigSM21_F0Reliability_ICASSP,
-  author = {Sebastian Rosenzweig and Frank Scherbaum and Meinard M{\"u}ller},
-  title = {Reliability Assessment of Singing Voice {F0}-Estimates Using Multiple Algorithms},
-  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
-  pages = {261--265},
-  address = {Toronto, Canada},
-  year = {2021},
-  doi = {10.1109/ICASSP39728.2021.9413372}
-}
-
-@book{Mueller21_FMP_SPRINGER,
-  author = {Meinard M\"{u}ller},
-  title = {Fundamentals of Music Processing -- Using Python and Jupyter Notebooks},
-  type = {Monograph},
-  year = {2021},
-  isbn = {978-3-030-69807-2},
-  publisher = {Springer Verlag},
-  edition = {2nd},
-  pages = {1--495},
-  doi = {10.1007/978-3-030-69808-9},
-  url-details = {http://www.music-processing.de}
+@article{CheveigneK02_YIN_JASA,
+  author = {Alain de Cheveign{\'e} and Hideki Kawahara},
+  title = {{YIN}, a fundamental frequency estimator for speech and music.},
+  journal = {Journal of the Acoustical Society of America (JASA)},
+  year = {2002},
+  volume = {111},
+  pages = {1917--1930},
+  number = {4},
 }
 
-@article{Scherbaum16_LarynxMicrophones_IWFMA,
-  author = {Frank Scherbaum},
-  title = {On the Benefit of Larynx-Microphone Field Recordings for the Documentation and Analysis of Polyphonic Vocal Music},
-  journal = {Proceedings of the International Workshop Folk Music Analysis},
-  pages = {80--87},
-  address = {Dublin,Ireland},
-  year = {2016}
+@article{Boersma01_Praat_GI,
+  author = {Paul Boersma},
+  journal = {Glot International},
+  number = {9/10},
+  pages = {341--345},
+  title = {{Praat}, a system for doing phonetics by computer},
+  volume = {5},
+  year = {2001}
 }
 
 @book{HagermanS80_Barbershop_CITESEER,

paper/paper.md

Lines changed: 7 additions & 5 deletions
@@ -9,10 +9,12 @@ tags:
 authors:
   - name: Sebastian Rosenzweig
     orcid: 0000-0003-4964-9217
+    equal-contrib: true
     corresponding: true
     affiliation: 1
   - name: Marius Kriegerowski
     orcid:
+    equal-contrib: true
     corresponding: false
     affiliation: 2
   - name: Frank Scherbaum
@@ -35,7 +37,7 @@ bibliography: paper.bib
 # Summary
 Polyphonic singing is one of the most widespread forms of music-making. During a performance, singers must constantly adjust their pitch to stay in tune with one another — a complex skill that requires extensive practice. Research has shown that pitch monitoring tools can assist singers in fine-tuning their intonation during a performance [@BerglinPD22_VisualFeedback_JPM]. Specifically, real-time visualizations of the fundamental frequency (F0), which represents the pitch of the singing voice, help singers assess their pitch relative to a fixed reference or other voices.
 To support the monitoring of polyphonic singing performances, we developed `pytch`, an interactive Python tool with a graphical user interface (GUI) designed to record, process, and visualize multiple voices in real time. The GUI displays vocal spectra and estimated F0 trajectories for all singers, as well as the harmonic intervals between them. Additionally, users can adjust visual and algorithmic parameters interactively to accommodate different input devices, microphone signals, singing styles, and use cases. Written in Python, `pytch` utilizes the `libf0-realtime` library [@MeierSM25_RealTimeF0_ISMIR] for real-time F0 estimation and `pyqtgraph`[^1] for efficient visualizations of the analysis results.
-Our tool builds upon a late-breaking demo in [@KriegerowskiS_Pytch_2017] - which we refer to as version 1. Since then, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware - which we refer to as version 2. Over its seven years of development, `pytch` has been tested and refined through use in several rehearsals, workshops, and field studies — including Sardinian quartet singing (see demo video [^2]) and traditional Georgian singing (see demo video [^3]).
+Our tool builds upon a late-breaking demo [@KriegerowskiS_Pytch_2017], which we refer to as version 1. Since then, the tool has been significantly extended with a new real-time graphics engine, a modular audio processing backend that facilitates the integration of additional algorithms, and improved support for a wider range of platforms and recording hardware; we refer to this as version 2. Over its seven years of development, `pytch` has been tested and refined through use in several rehearsals, workshops, and field studies — including Sardinian quartet singing (see demo video[^2]) and traditional Georgian singing (see demo video[^3]).
 
 [^1]: <https://www.pyqtgraph.org>
 [^2]: <https://www.uni-potsdam.de/de/soundscapelab/computational-ethnomusicology/the-benefit-of-body-vibration-recordings/real-time-analysis-of-larynx-microphone-recordings>
@@ -57,19 +59,19 @@ In addition to its practical applications, `pytch` also provides a flexible plat
 
 # Multitrack Singing Recordings
 
-To fully leverage the capabilities of `pytch`, it is essential to record each singer with an individual microphone. In contrast, stereo recordings—such as those captured by a room microphone placed in front of the ensemble—often suffer from overlapping signals, making it difficult to analyze individual voices. Suitable multitrack recordings can be obtained using handheld dynamic microphones or headset microphones, both of which provide good audio quality. However, these setups are prone to cross-talk, especially when singers are positioned close together.
+To fully leverage the capabilities of `pytch`, it is essential to record each singer with an individual microphone. Stereo recordings—such as those captured by a room microphone placed in front of the ensemble—often suffer from overlapping signals, making it difficult to analyze individual voices. While there is no hard limit on the number of channels, we recommend recording up to four individual singers to keep the charts legible and the GUI responsive. Suitable multitrack recordings can be obtained using handheld dynamic microphones or headset microphones. However, these setups are prone to cross-talk, especially when singers are positioned close together.
 
-One way to reduce cross-talk is to increase the physical distance between singers or to record them in isolation. However, this is not always feasible, as singers need to hear one another to maintain accurate tuning. An effective workaround is the use of contact microphones, such as throat microphones, which capture vocal fold vibrations directly from the skin of the throat. This method offers a significant advantage: the recorded signals are largely immune to interference from other singers, resulting in much cleaner, more isolated recordings [@Scherbaum16_LarynxMicrophones_IWFMA].
+One way to reduce cross-talk is to increase the physical distance between singers or to record them in isolation. However, this is not always feasible, as singers need to hear one another to maintain accurate tuning. An effective workaround is the use of contact microphones, such as throat microphones, which capture vocal fold vibrations directly from the skin of the throat. This method offers a significant advantage: the recorded signals are largely immune to interference from other singers, resulting in much cleaner, more isolated recordings. Throat microphones have been used successfully to record vocal ensembles in several past studies [@Scherbaum16_LarynxMicrophones_IWFMA].
 
 
 # Audio Processing
 The real-time audio processing pipeline implemented in the file `audio.py` is the heart of `pytch` and consists of two main stages: recording and analysis. The recording stage captures multichannel audio waveforms from the soundcard or an external audio interface using the `sounddevice` library. The library is based on PortAudio and supports a wide range of operating systems, audio devices, and sampling rates. The recorded audio is received in chunks via a recording callback and fed into a ring buffer shared with the analysis process. When the buffer is sufficiently filled with audio chunks, the analysis process reads the recorded audio to compute several audio features.
 
-For each channel, the analysis stage computes the audio level in dBFS, a time-frequency representation of the audio signal via the Short-Time Fourier Transform (see [@Mueller21_FMP_SPRINGER] for fundamentals of music processing), and an estimate of the F0 along with a confidence value using the `libf0-realtime` library [@MeierSM25_RealTimeF0_ISMIR]. The library includes several real-time implementations of well-known F0 estimation algorithms, such as YIN [@CheveigneK02_YIN_JASA] and SWIPE. YIN is a time-domain algorithm that computes the F0 based on a tweaked auto-correlation function. It is computationally efficient and well-suited for low-latency applications, but it tends to suffer from estimation errors, particularly confusions with higher harmonics such as the octave. In contrast, SWIPE is a frequency-domain algorithm that estimates the F0 by matching different spectral representations of the audio with sawtooth-like kernels. While more computationally demanding, SWIPE typically yields more reliable estimates, in particular for vocal input signals. `pytch` allows users to choose between these algorithms depending on their specific needs and system capabilities. The obtained F0 estimates, which are natively computed in the unit Hz are converted to the unit cents using a user-specified reference frequency. Depending on the audio quality and vocal characteristics, F0 estimates may exhibit artifacts such as discontinuities or pitch slides, which can make the resulting trajectories difficult to interpret [@RosenzweigSM19_StableF0_ISMIR]. Previous research has shown that using throat microphones can improve the isolation of individual voices in group singing contexts, resulting in cleaner signals and more accurate F0 estimates [@Scherbaum16_LarynxMicrophones_IWFMA]. To further enhance interpretability, `pytch` includes several optional post-processing steps: a confidence threshold to discard estimates with low confidence score, a median filter to smooth the trajectories, and a gradient filter to suppress abrupt pitch slides. As a final step in the audio analysis, the harmonic intervals between the F0 trajectories are computed. Every audio feature is stored separately in a dedicated ring buffer. After processing, the pipeline sets a flag that notifies the GUI that new data is ready for visualization.
+For each channel, the analysis stage computes the audio level in dBFS, a time-frequency representation of the audio signal via the Short-Time Fourier Transform (see [@Mueller21_FMP_SPRINGER] for fundamentals of music processing), and an estimate of the F0 along with a confidence value using the `libf0-realtime` library [@MeierSM25_RealTimeF0_ISMIR]. The library includes several real-time implementations of well-known F0 estimation algorithms, such as YIN [@CheveigneK02_YIN_JASA] and SWIPE [@CamachoH08_SawtoothWaveform_JASA]. YIN is a time-domain algorithm that computes the F0 based on a tweaked auto-correlation function. It is computationally efficient and well-suited for low-latency applications, but it tends to suffer from estimation errors, particularly confusions with higher harmonics such as the octave. In contrast, SWIPE is a frequency-domain algorithm that estimates the F0 by matching different spectral representations of the audio with sawtooth-like kernels. While more computationally demanding, SWIPE typically yields more reliable estimates, in particular for vocal input signals. `pytch` allows users to choose between these algorithms depending on their specific needs and system capabilities. The obtained F0 estimates, which are natively computed in Hz, are converted to cents using a user-specified reference frequency. Depending on the audio quality and vocal characteristics, F0 estimates may exhibit artifacts such as discontinuities or pitch slides, which can make the resulting trajectories difficult to interpret [@RosenzweigSM19_StableF0_ISMIR]. Previous research has shown that using throat microphones can improve the isolation of individual voices in group singing contexts, resulting in cleaner signals and more accurate F0 estimates [@Scherbaum16_LarynxMicrophones_IWFMA]. To further enhance interpretability, `pytch` includes several optional post-processing steps: a confidence threshold to discard estimates with a low confidence score, a median filter to smooth the trajectories, and a gradient filter to suppress abrupt pitch slides. As a final step in the audio analysis, the harmonic intervals between the F0 trajectories are computed. Every audio feature is stored separately in a dedicated ring buffer. After processing, the pipeline sets a flag that notifies the GUI that new data is ready for visualization.
 
 
 # Graphical User Interface (GUI)
-In this section, we provide a step-by-step explanation of the `pytch` GUI implemented in the file `gui.py`. Right after the program start, a startup menu opens in which the user is asked to specify the soundcard, input channels, sampling rate, and window size for processing. Furthermore, the user can choose to store the recorded audio and the F0 trajectories on disk. These configuration choices are required to initialize the audio processing module and the main GUI which is loaded when the user clicks "ok". While there is no hard limit on the number of channels, we recommend to use up to four input channels to ensure visibility of the charts and responsiveness of the GUI. A screenshot of the main GUI is shown in Figure \autoref{fig:GUI}.
+In this section, we provide a step-by-step explanation of the `pytch` GUI implemented in the file `gui.py`. Right after program start, a startup menu opens in which the user is asked to specify the soundcard, input channels, sampling rate, and window size for processing. Furthermore, the user can choose to store the recorded audio and the F0 trajectories on disk. These configuration choices are required to initialize the audio processing module and the main GUI, which is loaded when the user clicks "ok". A screenshot of the main GUI, which opens after successful initialization, is shown in \autoref{fig:GUI}.
 
 ![`pytch` GUI monitoring three singing voices.\label{fig:GUI}](../pictures/screenshot.png){ width=90% }
 
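The recording stage described in the `paper.md` diff above follows a standard `sounddevice` callback pattern: PortAudio delivers fixed-size multichannel chunks to a callback, which appends them to a ring buffer shared with the analysis process. The sketch below illustrates that pattern only; the `RingBuffer` class, chunk size, and channel count are illustrative assumptions, not `pytch`'s actual code.

```python
# Minimal sketch of the recording stage: a sounddevice InputStream feeds
# multichannel chunks into a ring buffer. RingBuffer, chunk size, and
# channel count are illustrative, not pytch's actual implementation.
import numpy as np
import sounddevice as sd

class RingBuffer:
    """Fixed-size buffer that keeps the most recent audio frames."""
    def __init__(self, capacity, channels):
        self.data = np.zeros((capacity, channels), dtype=np.float32)
        self.write_pos = 0

    def push(self, chunk):
        # Wrap indices so old frames are overwritten once the buffer is full.
        idx = (self.write_pos + np.arange(len(chunk))) % len(self.data)
        self.data[idx] = chunk
        self.write_pos = (self.write_pos + len(chunk)) % len(self.data)

buf = RingBuffer(capacity=16 * 4096, channels=2)

def callback(indata, frames, time, status):
    if status:
        print(status)        # report over- or underruns
    buf.push(indata.copy())  # copy, since PortAudio reuses the indata array

# Record two channels at 44.1 kHz in chunks of 4096 samples for two seconds.
with sd.InputStream(samplerate=44100, blocksize=4096,
                    channels=2, callback=callback):
    sd.sleep(2000)
```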