cmusphinx · lenzo-ka · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/cython/_pocketsphinx.pyx b/cython/_pocketsphinx.pyx
@@ -1990,23 +1990,28 @@ cdef class Endpointer:
         return (<const unsigned char *>&outbuf[0])[:out_n_samples * 2]
 
 cdef class AlignmentEntry:
-    """Entry (word, phone, state) in an alignment.
+    """Entry (word, phone, or state) in an alignment.
 
-    Iterating over this will iterate over its children (i.e. the
-    phones in a word or the states in a phone) if any.  For example::
+    Iterating over this will iterate over its children (phones in a
+    word, or states in a phone) if any.  For example, to print
+    word and phone timings in seconds::
 
         for word in decoder.get_alignment():
-            print("%s from %.2f to %.2f" % (word.name, word.start,
-                                            word.start + word.duration))
+            print("%s from %.3f to %.3f seconds" % (word.name,
+                                                    word.start / 100,
+                                                    (word.start + word.duration) / 100))
             for phone in word:
-                print("%s at %.2f duration %.2f" %
-                      (phone.name, phone.start, phone.duration))
+                print("  %s at %.3f for %.3f seconds" % (phone.name,
+                                                         phone.start / 100,
+                                                         phone.duration / 100))
 
     Attributes:
-      name(str): Name of segment (word, phone name, state id)
-      start(int): Index of start frame.
-      duration(int): Duration in frames.
-      score(float): Acoustic score (density).
+      name(str): Text of this entry (word string, phone symbol, or
+                 state ID as string).
+      start(int): Start frame index.  Divide by the frame rate for seconds
+                  (default 100 frames per second, i.e. 10 ms per frame).
+      duration(int): Duration in frames.  Divide by the frame rate for seconds.
+      score(int): Acoustic score (log probability, higher is better).
     """
     cdef public int start
     cdef public int duration
@@ -2034,9 +2039,16 @@ cdef class AlignmentEntry:
 cdef class Alignment:
     """Sub-word alignment as returned by `get_alignment`.
 
-    For the moment this is read-only.  You are able to iterate over
-    the words, phones, or states in it, as well as sub-iterating over
-    each of their children, as described in `AlignmentEntry`.
+    Alignments have three levels: words, phones, and HMM states.
+    Words contain phones, and phones contain states.
+
+    There are two ways to iterate:
+
+    - Flat iteration over a single level, using `words()`, `phones()`,
+      or `states()`.
+
+    - Hierarchical iteration: iterate over an `AlignmentEntry` to get
+      its children (phones of a word, or states of a phone).
     """
     cdef ps_alignment_t *_al
 

diff --git a/include/pocketsphinx/alignment.h b/include/pocketsphinx/alignment.h
@@ -64,6 +64,22 @@ extern "C" {
 /**
  * @struct ps_alignment_t pocketsphinx/alignment.h
  * @brief Multi-level alignment (words, phones, states) over an utterance.
+ *
+ * Alignments are organized hierarchically: words contain phones, and
+ * phones contain HMM states.  Use ps_alignment_words(),
+ * ps_alignment_phones(), or ps_alignment_states() to iterate at each
+ * level, and ps_alignment_iter_children() to descend into children.
+ *
+ * Each entry has the following fields, accessible via
+ * ps_alignment_iter_seg() and ps_alignment_iter_name():
+ *
+ *  - name: Text (word string, phone symbol, or state ID as string)
+ *  - start: Start frame index
+ *  - duration: Duration in frames
+ *  - score: Acoustic score (log probability, higher is better)
+ *
+ * To convert frames to seconds, divide by the frame rate (default
+ * 100 frames per second, i.e. 10 ms per frame).
  */
 typedef struct ps_alignment_s ps_alignment_t;
 

diff --git a/programs/pocketsphinx_main.c b/programs/pocketsphinx_main.c
@@ -730,6 +730,13 @@ usage(char *name, int help_config)
     fprintf(stderr, "\tsox -qd $(%s soxflags) | %s live -\n", name, name);
     fprintf(stderr, "\t%s single INPUT\n", name);
     fprintf(stderr, "\t%s align INPUT WORDS...\n", name);
+    fprintf(stderr, "\nOutput format:\n");
+    fprintf(stderr, "  JSON with the following fields:\n");
+    fprintf(stderr, "    b  Begin time in seconds\n");
+    fprintf(stderr, "    d  Duration in seconds\n");
+    fprintf(stderr, "    p  Probability (acoustic model score)\n");
+    fprintf(stderr, "    t  Text of utterance or segment\n");
+    fprintf(stderr, "    w  Array of word segments\n");
     fprintf(stderr, "\nFor detailed PARAMS values, run %s help-config\n", name);
     if (help_config) {
         err_set_loglevel(ERR_INFO);
@@ -750,6 +757,9 @@ usage_align(char *name)
     fprintf(stderr, "                         (default: no)\n");
     fprintf(stderr, "  -state_align yes/no    Run a second pass to align phones and states and print their\n");
     fprintf(stderr, "                         durations. This implies -phone_align yes (default: no)\n");
+    fprintf(stderr, "\nBy default, output contains words only.  With -phone_align, each\n");
+    fprintf(stderr, "word in \"w\" contains a nested \"w\" array of phones.  With\n");
+    fprintf(stderr, "-state_align, each phone also contains a nested \"w\" of HMM states.\n");
     fprintf(stderr, "\nExamples:\n");
     fprintf(stderr, "  # Basic word alignment:\n");
     fprintf(stderr, "  %s align audio.wav \"hello world\"\n", name);

diff --git a/src/ps_alignment_internal.h b/src/ps_alignment_internal.h
@@ -57,7 +57,8 @@ extern "C" {
 typedef struct ps_alignment_entry_s {
     int32 start;  /**< Start frame index. */
     int32 duration; /**< Duration in frames. */
-    int32 score;  /**< Alignment score (fairly meaningless). */
+    int32 score;  /**< Acoustic score (log probability).  Higher
+                       (less negative) is better. */
     /**
      * Index of parent node.
      *