diff --git a/cython/_pocketsphinx.pyx b/cython/_pocketsphinx.pyx index d94f4a3c..d5ef2da6 100644 --- a/cython/_pocketsphinx.pyx +++ b/cython/_pocketsphinx.pyx @@ -1990,23 +1990,28 @@ cdef class Endpointer: return (&outbuf[0])[:out_n_samples * 2] cdef class AlignmentEntry: - """Entry (word, phone, state) in an alignment. + """Entry (word, phone, or state) in an alignment. - Iterating over this will iterate over its children (i.e. the - phones in a word or the states in a phone) if any. For example:: + Iterating over this will iterate over its children (phones in a + word, or states in a phone) if any. For example, to print + word and phone timings in seconds:: for word in decoder.get_alignment(): - print("%s from %.2f to %.2f" % (word.name, word.start, - word.start + word.duration)) + print("%s from %.3f to %.3f seconds" % (word.name, + word.start / 100, + (word.start + word.duration) / 100)) for phone in word: - print("%s at %.2f duration %.2f" % - (phone.name, phone.start, phone.duration)) + print(" %s at %.3f for %.3f seconds" % (phone.name, + phone.start / 100, + phone.duration / 100)) Attributes: - name(str): Name of segment (word, phone name, state id) - start(int): Index of start frame. - duration(int): Duration in frames. - score(float): Acoustic score (density). + name(str): Text of this entry (word string, phone symbol, or + state ID as string). + start(int): Start frame index. Divide by frame rate for seconds + (default 100, i.e. 10ms per frame). + duration(int): Duration in frames. Divide by frame rate for seconds. + score(int): Acoustic score (log probability, higher is better). """ cdef public int start cdef public int duration @@ -2034,9 +2039,16 @@ cdef class AlignmentEntry: cdef class Alignment: """Sub-word alignment as returned by `get_alignment`. - For the moment this is read-only. You are able to iterate over - the words, phones, or states in it, as well as sub-iterating over - each of their children, as described in `AlignmentEntry`. + Alignments have three levels: words, phones, and HMM states. + Words contain phones, and phones contain states. + + There are two ways to iterate: + + Flat iteration over a single level using `words()`, `phones()`, + or `states()`. + + Hierarchical iteration by iterating over an `AlignmentEntry` to + get its children (phones of a word, or states of a phone). """ cdef ps_alignment_t *_al diff --git a/include/pocketsphinx/alignment.h b/include/pocketsphinx/alignment.h index bcc6e97b..65b84a74 100644 --- a/include/pocketsphinx/alignment.h +++ b/include/pocketsphinx/alignment.h @@ -64,6 +64,22 @@ extern "C" { /** * @struct ps_alignment_t pocketsphinx/alignment.h * @brief Multi-level alignment (words, phones, states) over an utterance. + * + * Alignments are organized hierarchically: words contain phones, and + * phones contain HMM states. Use ps_alignment_words(), + * ps_alignment_phones(), or ps_alignment_states() to iterate at each + * level, and ps_alignment_iter_children() to descend into children. + * + * Each entry has the following fields, accessible via + * ps_alignment_iter_seg() and ps_alignment_iter_name(): + * + * - name: Text (word string, phone symbol, or state ID as string) + * - start: Start frame index + * - duration: Duration in frames + * - score: Acoustic score (log probability, higher is better) + * + * To convert frames to seconds, divide by the frame rate (default + * 100, i.e. 10ms per frame). */ typedef struct ps_alignment_s ps_alignment_t; diff --git a/programs/pocketsphinx_main.c b/programs/pocketsphinx_main.c index 9fe8de8d..1c7a6c0c 100644 --- a/programs/pocketsphinx_main.c +++ b/programs/pocketsphinx_main.c @@ -730,6 +730,13 @@ usage(char *name, int help_config) fprintf(stderr, "\tsox -qd $(%s soxflags) | %s live -\n", name, name); fprintf(stderr, "\t%s single INPUT\n", name); fprintf(stderr, "\t%s align INPUT WORDS...\n", name); + fprintf(stderr, "\nOutput format:\n"); + fprintf(stderr, " JSON with the following fields:\n"); + fprintf(stderr, " b Begin time in seconds\n"); + fprintf(stderr, " d Duration in seconds\n"); + fprintf(stderr, " p Probability (acoustic model score)\n"); + fprintf(stderr, " t Text of utterance or segment\n"); + fprintf(stderr, " w Array of word segments\n"); fprintf(stderr, "\nFor detailed PARAMS values, run %s help-config\n", name); if (help_config) { err_set_loglevel(ERR_INFO); @@ -750,6 +757,9 @@ usage_align(char *name) fprintf(stderr, " (default: no)\n"); fprintf(stderr, " -state_align yes/no Run a second pass to align phones and states and print their\n"); fprintf(stderr, " durations. This implies -phone_align yes (default: no)\n"); + fprintf(stderr, "\nBy default, output contains words only. With -phone_align, each\n"); + fprintf(stderr, "word in \"w\" contains a nested \"w\" array of phones. With\n"); + fprintf(stderr, "-state_align, each phone also contains a nested \"w\" of HMM states.\n"); fprintf(stderr, "\nExamples:\n"); fprintf(stderr, " # Basic word alignment:\n"); fprintf(stderr, " %s align audio.wav \"hello world\"\n", name); diff --git a/src/ps_alignment_internal.h b/src/ps_alignment_internal.h index 72e2d6a5..7f6e67ed 100644 --- a/src/ps_alignment_internal.h +++ b/src/ps_alignment_internal.h @@ -57,7 +57,8 @@ extern "C" { typedef struct ps_alignment_entry_s { int32 start; /**< Start frame index. */ int32 duration; /**< Duration in frames. */ - int32 score; /**< Alignment score (fairly meaningless). */ + int32 score; /**< Acoustic score (log probability). Higher + (less negative) is better. */ /** * Index of parent node. *