Skip to content

Commit 971b3ef

Browse files
committed
test(transcript): expand abbreviation boundary regression corpus
Add coverage for pronoun-start and locative initialism cases while preserving conjunction non-boundary behavior around etc./u.s. periods.
1 parent 7530d40 commit 971b3ef

File tree

1 file changed

+174
-0
lines changed

1 file changed

+174
-0
lines changed

apps/sotto/internal/transcript/assemble_test.go

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,180 @@ func TestAssembleSentenceCaseDoesNotCapitalizeDomainOrDecimalFragments(t *testin
6262
require.Equal(t, "Check example.com and v2.1 first. Then reply", got)
6363
}
6464

65+
// TestAssembleSentenceCaseAbbreviationRegressionCorpus is a table-driven
// regression test for sentence capitalization around abbreviation periods.
// Each case passes a single lowercase fragment to Assemble with
// CapitalizeSentences enabled and TrailingSpace disabled, and requires the
// output to match the expected string exactly. The case names describe
// whether the period after the abbreviation ("tbsp.", "min.", "dr.", "etc.",
// "vs.", "u.s.", "no.") is expected to start a new sentence.
func TestAssembleSentenceCaseAbbreviationRegressionCorpus(t *testing.T) {
66+
t.Parallel()
67+
68+
cases := []struct {
69+
name string // subtest name passed to t.Run
70+
in string // single input fragment handed to Assemble
71+
want string // exact expected output
72+
}{
73+
{
74+
name: "measurement_tbsp",
75+
in: "add 1 tbsp. sugar and stir",
76+
want: "Add 1 tbsp. sugar and stir",
77+
},
78+
{
79+
name: "measurement_min", // "then" after "min." is expected to stay lowercase
80+
in: "mix for 5 min. then serve",
81+
want: "Mix for 5 min. then serve",
82+
},
83+
{
84+
name: "title_abbreviation_inside_sentence",
85+
in: "we spoke with dr. smith yesterday. then we left",
86+
want: "We spoke with dr. smith yesterday. Then we left",
87+
},
88+
{
89+
name: "ambiguous_etc_sentence_starter", // "then" after "etc." is expected to be capitalized
90+
in: "we covered apples, etc. then moved on",
91+
want: "We covered apples, etc. Then moved on",
92+
},
93+
{
94+
name: "ambiguous_etc_pronoun_boundary",
95+
in: "we listed apples, etc. we moved on",
96+
want: "We listed apples, etc. We moved on",
97+
},
98+
{
99+
name: "ambiguous_etc_conjunction_continuation", // "and" after "etc." is expected to stay lowercase
100+
in: "bring apples etc. and bananas",
101+
want: "Bring apples etc. and bananas",
102+
},
103+
{
104+
name: "ambiguous_vs_conservative",
105+
in: "compare this vs. that option. then decide",
106+
want: "Compare this vs. that option. Then decide",
107+
},
108+
{
109+
name: "initialism_sentence_starter",
110+
in: "we moved to the u.s. then we celebrated",
111+
want: "We moved to the u.s. Then we celebrated",
112+
},
113+
{
114+
name: "initialism_pronoun_boundary",
115+
in: "we moved to the u.s. we celebrated",
116+
want: "We moved to the u.s. We celebrated",
117+
},
118+
{
119+
name: "initialism_embedded_locative_pronoun_boundary",
120+
in: "i lived in the u.s. we can travel there",
121+
want: "I lived in the u.s. We can travel there",
122+
},
123+
{
124+
name: "initialism_conjunction_continuation",
125+
in: "we moved to the u.s. and stayed",
126+
want: "We moved to the u.s. and stayed",
127+
},
128+
{
129+
name: "initialism_locative_non_boundary", // leading "in the u.s." is expected not to end the sentence
130+
in: "in the u.s. we have states",
131+
want: "In the u.s. we have states",
132+
},
133+
{
134+
name: "initialism_locative_non_boundary_after_sentence_boundary",
135+
in: "this is true. in the u.s. we have states",
136+
want: "This is true. In the u.s. we have states",
137+
},
138+
{
139+
name: "initialism_origin_non_boundary", // leading "from the u.s." is expected not to end the sentence
140+
in: "from the u.s. we have exports",
141+
want: "From the u.s. we have exports",
142+
},
143+
{
144+
name: "initialism_embedded_origin_boundary",
145+
in: "i came from the u.s. we celebrated",
146+
want: "I came from the u.s. We celebrated",
147+
},
148+
{
149+
name: "no_default_boundary", // "then" after "no." is expected to be capitalized
150+
in: "no. then we continue",
151+
want: "No. Then we continue",
152+
},
153+
}
154+
155+
for _, tc := range cases {
156+
tc := tc // per-iteration copy so each parallel subtest closure keeps its own case (needed before Go 1.22)
157+
t.Run(tc.name, func(t *testing.T) {
158+
t.Parallel()
159+
160+
got := Assemble([]string{tc.in}, Options{
161+
TrailingSpace: false,
162+
CapitalizeSentences: true,
163+
})
164+
require.Equal(t, tc.want, got)
165+
})
166+
}
167+
}
168+
169+
// TestAssembleSentenceCaseDoesNotCapitalizeAfterCommonAbbreviations checks
// that the words following "i.e." and "e.g." stay lowercase, while the first
// word of the input and the word after the final "case." period ("then") are
// capitalized.
func TestAssembleSentenceCaseDoesNotCapitalizeAfterCommonAbbreviations(t *testing.T) {
170+
t.Parallel()
171+
172+
got := Assemble([]string{"for i.e. this case and e.g. that case. then proceed"}, Options{
173+
TrailingSpace: false,
174+
CapitalizeSentences: true,
175+
})
176+
require.Equal(t, "For i.e. this case and e.g. that case. Then proceed", got)
177+
}
178+
179+
// TestAssembleSentenceCaseKeepsPronounIDistinctFromIEAbbreviation checks that
// the standalone leading "i" becomes "I" while the "i.e." abbreviation later
// in the same fragment, and the word after it, are left lowercase.
func TestAssembleSentenceCaseKeepsPronounIDistinctFromIEAbbreviation(t *testing.T) {
180+
t.Parallel()
181+
182+
got := Assemble([]string{"i said i.e. this should stay lowercase"}, Options{
183+
TrailingSpace: false,
184+
CapitalizeSentences: true,
185+
})
186+
require.Equal(t, "I said i.e. this should stay lowercase", got)
187+
}
188+
189+
// TestAssembleSentenceCaseKeepsLeadingIEAbbreviationLowercase checks that a
// fragment beginning with "i.e." is returned unchanged: the abbreviation is
// not capitalized even though it is the first token of the input.
func TestAssembleSentenceCaseKeepsLeadingIEAbbreviationLowercase(t *testing.T) {
190+
t.Parallel()
191+
192+
got := Assemble([]string{"i.e. this should stay lowercase"}, Options{
193+
TrailingSpace: false,
194+
CapitalizeSentences: true,
195+
})
196+
require.Equal(t, "i.e. this should stay lowercase", got)
197+
}
198+
199+
// TestAssembleSentenceCaseKeepsPostBoundaryIEAbbreviationLowercase checks
// that "i.e." stays lowercase when it directly follows the period ending
// "this is true.", while the first word of the input is still capitalized.
func TestAssembleSentenceCaseKeepsPostBoundaryIEAbbreviationLowercase(t *testing.T) {
200+
t.Parallel()
201+
202+
got := Assemble([]string{"this is true. i.e. this should stay lowercase"}, Options{
203+
TrailingSpace: false,
204+
CapitalizeSentences: true,
205+
})
206+
require.Equal(t, "This is true. i.e. this should stay lowercase", got)
207+
}
208+
209+
// TestAssembleSentenceCaseCapitalizesTitleAbbreviationAtSentenceStart checks
// that a leading "dr." is capitalized to "Dr." and that the word following
// the abbreviation's period ("smith") is left lowercase.
func TestAssembleSentenceCaseCapitalizesTitleAbbreviationAtSentenceStart(t *testing.T) {
210+
t.Parallel()
211+
212+
got := Assemble([]string{"dr. smith can help"}, Options{
213+
TrailingSpace: false,
214+
CapitalizeSentences: true,
215+
})
216+
require.Equal(t, "Dr. smith can help", got)
217+
}
218+
219+
// TestAssembleSentenceCaseCapitalizesTitleAbbreviationAfterBoundary checks
// that "dr." following the period after "this happened" is capitalized to
// "Dr.", and that "smith" after the abbreviation's own period stays lowercase.
func TestAssembleSentenceCaseCapitalizesTitleAbbreviationAfterBoundary(t *testing.T) {
220+
t.Parallel()
221+
222+
got := Assemble([]string{"this happened. dr. smith replied"}, Options{
223+
TrailingSpace: false,
224+
CapitalizeSentences: true,
225+
})
226+
require.Equal(t, "This happened. Dr. smith replied", got)
227+
}
228+
229+
// TestAssembleSentenceCaseDoesNotCapitalizeAfterInitialismAbbreviation checks
// that the word following "u.s." ("government") stays lowercase, while the
// word after the later "report." period ("then") is capitalized.
func TestAssembleSentenceCaseDoesNotCapitalizeAfterInitialismAbbreviation(t *testing.T) {
230+
t.Parallel()
231+
232+
got := Assemble([]string{"in the u.s. government report. then we continue"}, Options{
233+
TrailingSpace: false,
234+
CapitalizeSentences: true,
235+
})
236+
require.Equal(t, "In the u.s. government report. Then we continue", got)
237+
}
238+
65239
func TestAssembleSentenceCaseHandlesQuoteAfterBoundary(t *testing.T) {
66240
t.Parallel()
67241

0 commit comments

Comments
 (0)