Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 26 additions & 14 deletions cython/_pocketsphinx.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1990,23 +1990,28 @@ cdef class Endpointer:
return (<const unsigned char *>&outbuf[0])[:out_n_samples * 2]

cdef class AlignmentEntry:
"""Entry (word, phone, state) in an alignment.
"""Entry (word, phone, or state) in an alignment.

Iterating over this will iterate over its children (i.e. the
phones in a word or the states in a phone) if any. For example::
Iterating over this will iterate over its children (phones in a
word, or states in a phone) if any. For example, to print
word and phone timings in seconds::

for word in decoder.get_alignment():
print("%s from %.2f to %.2f" % (word.name, word.start,
word.start + word.duration))
print("%s from %.3f to %.3f seconds" % (word.name,
word.start / 100,
(word.start + word.duration) / 100))
for phone in word:
print("%s at %.2f duration %.2f" %
(phone.name, phone.start, phone.duration))
print(" %s at %.3f for %.3f seconds" % (phone.name,
phone.start / 100,
phone.duration / 100))

Attributes:
name(str): Name of segment (word, phone name, state id)
start(int): Index of start frame.
duration(int): Duration in frames.
score(float): Acoustic score (density).
name(str): Text of this entry (word string, phone symbol, or
state ID as string).
start(int): Start frame index. Divide by frame rate for seconds
(default 100, i.e. 10ms per frame).
duration(int): Duration in frames. Divide by frame rate for seconds.
score(int): Acoustic score (log probability, higher is better).
"""
cdef public int start
cdef public int duration
Expand Down Expand Up @@ -2034,9 +2039,16 @@ cdef class AlignmentEntry:
cdef class Alignment:
"""Sub-word alignment as returned by `get_alignment`.

For the moment this is read-only. You are able to iterate over
the words, phones, or states in it, as well as sub-iterating over
each of their children, as described in `AlignmentEntry`.
Alignments have three levels: words, phones, and HMM states.
Words contain phones, and phones contain states.

There are two ways to iterate:

Flat iteration over a single level using `words()`, `phones()`,
or `states()`.

Hierarchical iteration by iterating over an `AlignmentEntry` to
get its children (phones of a word, or states of a phone).
"""
cdef ps_alignment_t *_al

Expand Down
16 changes: 16 additions & 0 deletions include/pocketsphinx/alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,22 @@ extern "C" {
/**
 * @struct ps_alignment_t pocketsphinx/alignment.h
 * @brief Multi-level alignment (words, phones, states) over an utterance.
 *
 * Alignments are organized hierarchically: words contain phones, and
 * phones contain HMM states.  Use ps_alignment_words(),
 * ps_alignment_phones(), or ps_alignment_states() to iterate at each
 * level, and ps_alignment_iter_children() to descend into children.
 *
 * Each entry has the following fields, accessible via
 * ps_alignment_iter_seg() and ps_alignment_iter_name():
 *
 *  - name: Text (word string, phone symbol, or state ID as string)
 *  - start: Start frame index
 *  - duration: Duration in frames
 *  - score: Acoustic score (integer log probability; higher, i.e. less
 *    negative, is better)
 *
 * Times are expressed in frames, not seconds.  To convert, divide by
 * the frame rate (default 100, i.e. 10ms per frame): at the default
 * rate an entry starts at start / 100.0 seconds and ends at
 * (start + duration) / 100.0 seconds.
 */
typedef struct ps_alignment_s ps_alignment_t;

Expand Down
10 changes: 10 additions & 0 deletions programs/pocketsphinx_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,13 @@ usage(char *name, int help_config)
fprintf(stderr, "\tsox -qd $(%s soxflags) | %s live -\n", name, name);
fprintf(stderr, "\t%s single INPUT\n", name);
fprintf(stderr, "\t%s align INPUT WORDS...\n", name);
fprintf(stderr, "\nOutput format:\n");
fprintf(stderr, " JSON with the following fields:\n");
fprintf(stderr, " b Begin time in seconds\n");
fprintf(stderr, " d Duration in seconds\n");
fprintf(stderr, " p Probability (acoustic model score)\n");
fprintf(stderr, " t Text of utterance or segment\n");
fprintf(stderr, " w Array of word segments\n");
fprintf(stderr, "\nFor detailed PARAMS values, run %s help-config\n", name);
if (help_config) {
err_set_loglevel(ERR_INFO);
Expand All @@ -750,6 +757,9 @@ usage_align(char *name)
fprintf(stderr, " (default: no)\n");
fprintf(stderr, " -state_align yes/no Run a second pass to align phones and states and print their\n");
fprintf(stderr, " durations. This implies -phone_align yes (default: no)\n");
fprintf(stderr, "\nBy default, output contains words only. With -phone_align, each\n");
fprintf(stderr, "word in \"w\" contains a nested \"w\" array of phones. With\n");
fprintf(stderr, "-state_align, each phone also contains a nested \"w\" of HMM states.\n");
fprintf(stderr, "\nExamples:\n");
fprintf(stderr, " # Basic word alignment:\n");
fprintf(stderr, " %s align audio.wav \"hello world\"\n", name);
Expand Down
3 changes: 2 additions & 1 deletion src/ps_alignment_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ extern "C" {
typedef struct ps_alignment_entry_s {
int32 start; /**< Start frame index. */
int32 duration; /**< Duration in frames. */
int32 score; /**< Alignment score (fairly meaningless). */
int32 score; /**< Acoustic score (log probability). Higher
(less negative) is better. */
/**
* Index of parent node.
*
Expand Down