1-
1+ using System ;
2+ using System . Text . RegularExpressions ;
23using System . Collections . Generic ;
34using System . Globalization ;
45using System . Linq ;
@@ -17,6 +18,8 @@ public class ClipSegmentationService : IClipSegmentationService
1718{
1819 private const string DefaultSystemPrompt = "You are a news segment tool. Analyse timestamped transcripts, choose where new stories begin, and output JSON suitable for ffmpeg clip creation." ;
1920
21+ private const int ParagraphSentenceCount = 4 ;
22+
2023 private const string DefaultPrompt = """
2124You will receive a transcript formatted as numbered sentences (index. timestamp range :: sentence).
2225Identify up to {{max_clips}} places where a new story starts and return ONLY JSON:
@@ -29,9 +32,13 @@ You will receive a transcript formatted as numbered sentences (index. timestamp
2932Rules:
3033- `index` is the numbered sentence (1-based) where the new story begins.
3134- `score` ranges from 0-1; higher means stronger confidence.
35+ - Consider the optional heuristic cues before discarding a boundary.
3236- Keep boundaries chronological and avoid duplicates.
3337- Do not invent timestamps; rely only on the provided lines.
3438
39+ Heuristic cues (if provided):
40+ {{heuristic_notes}}
41+
3542Transcript:
3643{{transcript}}
3744""" ;
@@ -57,7 +64,8 @@ public async Task<IReadOnlyList<ClipDefinition>> GenerateClipsAsync(IReadOnlyLis
5764
5865 try
5966 {
60- var prompt = BuildPrompt ( transcript , settings ) ;
67+ var heuristicHits = BuildHeuristicHits ( transcript , settings ) ;
68+ var prompt = BuildPrompt ( transcript , settings , heuristicHits ) ;
6169 var systemPrompt = string . IsNullOrWhiteSpace ( settings ? . SystemPrompt ) ? DefaultSystemPrompt : settings ! . SystemPrompt ! ;
6270 var payload = new
6371 {
@@ -79,7 +87,7 @@ public async Task<IReadOnlyList<ClipDefinition>> GenerateClipsAsync(IReadOnlyLis
7987 var body = await response . Content . ReadAsStringAsync ( cancellationToken ) ;
8088 response . EnsureSuccessStatusCode ( ) ;
8189
82- var clipDefinitions = ParseResponse ( body , transcript ) ;
90+ var clipDefinitions = ParseResponse ( body , transcript , settings , heuristicHits ) ;
8391 if ( clipDefinitions . Count == 0 )
8492 {
8593 _logger . LogWarning ( "LLM segmentation did not return any clips. Falling back to a single clip definition." ) ;
@@ -101,25 +109,36 @@ private ClipDefinition BuildFallbackClip(IReadOnlyList<TimestampedTranscript> tr
101109 return new ClipDefinition ( "Full Program" , "AutoClipper fallback clip" , TimeSpan . Zero , end ) ;
102110 }
103111
/// <summary>
/// Renders the prompt sent to the LLM by filling the <c>{{max_clips}}</c>, <c>{{transcript}}</c> and
/// <c>{{heuristic_notes}}</c> placeholders of the selected template.
/// </summary>
/// <param name="transcript">Timestamped sentences to be listed in the prompt.</param>
/// <param name="overrides">Optional per-request settings; a non-blank <c>PromptOverride</c> wins over the configured and default templates.</param>
/// <param name="heuristicHits">Keyword matches to surface to the model as optional cues.</param>
/// <returns>The fully substituted prompt text.</returns>
private string BuildPrompt(IReadOnlyList<TimestampedTranscript> transcript, ClipSegmentationSettings? overrides, IReadOnlyList<HeuristicHit> heuristicHits)
{
    // Template precedence: per-request override, then configured prompt, then the built-in default.
    var template = !string.IsNullOrWhiteSpace(overrides?.PromptOverride)
        ? overrides!.PromptOverride!
        : string.IsNullOrWhiteSpace(_options.LlmPrompt) ? DefaultPrompt : _options.LlmPrompt;
    var includesHeuristicPlaceholder = template.Contains("{{heuristic_notes}}");
    var limit = overrides?.PromptCharacterLimit ?? Math.Max(1000, _options.LlmPromptCharacterLimit);
    var transcriptBody = BuildPromptTranscript(transcript, limit);
    var heuristicNotes = BuildHeuristicNotes(heuristicHits, transcript);

    var maxClips = overrides?.MaxStories ?? _options.LlmMaxStories;
    if (maxClips <= 0) maxClips = _options.LlmMaxStories;

    var prompt = template
        .Replace("{{max_clips}}", maxClips.ToString(CultureInfo.InvariantCulture))
        .Replace("{{transcript}}", transcriptBody)
        .Replace("{{heuristic_notes}}", heuristicNotes);

    // BuildHeuristicNotes returns a non-blank sentinel when there are no hits, so the notes text alone
    // cannot tell us whether there is anything worth appending. Gate on the hits themselves so custom
    // templates without the placeholder do not get an empty "cues" trailer.
    var hasHeuristicCues = heuristicHits is { Count: > 0 } && !string.IsNullOrWhiteSpace(heuristicNotes);
    if (!includesHeuristicPlaceholder && hasHeuristicCues)
    {
        prompt += "\n\nHeuristic cues (for reference):\n" + heuristicNotes;
    }

    return prompt;
}
119137
120138 private string BuildPromptTranscript ( IReadOnlyList < TimestampedTranscript > transcript , int limit )
121139 {
122140 var builder = new StringBuilder ( ) ;
141+ builder . AppendLine ( "Sentences:" ) ;
123142 for ( var i = 0 ; i < transcript . Count ; i ++ )
124143 {
125144 var sentence = transcript [ i ] ;
@@ -129,9 +148,87 @@ private string BuildPromptTranscript(IReadOnlyList<TimestampedTranscript> transc
129148 break ;
130149 builder . AppendLine ( line ) ;
131150 }
151+
152+ builder . AppendLine ( ) ;
153+ builder . AppendLine ( "Paragraphs:" ) ;
154+ var paragraphNumber = 1 ;
155+ var index = 0 ;
156+ while ( index < transcript . Count && builder . Length < limit )
157+ {
158+ var start = index ;
159+ var end = Math . Min ( index + ParagraphSentenceCount , transcript . Count ) ;
160+ var sentences = new List < string > ( ) ;
161+ for ( var j = start ; j < end ; j ++ )
162+ {
163+ var sentence = transcript [ j ] ;
164+ if ( string . IsNullOrWhiteSpace ( sentence . Text ) ) continue ;
165+ sentences . Add ( sentence . Text . Trim ( ) ) ;
166+ }
167+
168+ if ( sentences . Count > 0 )
169+ {
170+ var line = $ "Paragraph { paragraphNumber } (sentences { start + 1 } -{ end } ): { string . Join ( " / " , sentences ) } ";
171+ if ( builder . Length + line . Length > limit ) break ;
172+ builder . AppendLine ( line ) ;
173+ paragraphNumber ++ ;
174+ }
175+
176+ index += ParagraphSentenceCount ;
177+ }
178+
132179 return builder . ToString ( ) ;
133180 }
134181
/// <summary>
/// Formats heuristic keyword hits as one line per hit, ordered by sentence index, for inclusion in the prompt.
/// </summary>
/// <param name="hits">Keyword matches; may be null or empty.</param>
/// <param name="transcript">Transcript the hit indexes refer to (zero-based).</param>
/// <returns>
/// The formatted lines, or the literal <c>&lt;none&gt;</c> when there is nothing to report.
/// </returns>
private string BuildHeuristicNotes(IReadOnlyList<HeuristicHit>? hits, IReadOnlyList<TimestampedTranscript> transcript)
{
    const string noCues = "<none>";

    if (hits == null || hits.Count == 0) return noCues;
    if (transcript == null || transcript.Count == 0) return noCues;

    var sb = new StringBuilder();
    foreach (var hit in hits.OrderBy(h => h.Index))
    {
        // A hit that does not map onto the transcript cannot be described; skip it rather than
        // letting the indexer throw and abort prompt construction.
        if (hit.Index < 0 || hit.Index >= transcript.Count) continue;

        var sentence = transcript[hit.Index];
        var snippet = string.IsNullOrWhiteSpace(sentence.Text) ? string.Empty : sentence.Text.Trim();
        // Sentence numbers are shown 1-based to match the numbered transcript lines in the prompt.
        sb.AppendLine($"Sentence {hit.Index + 1} ({FormatTimestamp(sentence.Start)}): pattern '{hit.Pattern}' -> {snippet}");
    }

    return sb.Length == 0 ? noCues : sb.ToString().Trim();
}
194+
/// <summary>
/// Scans every transcript sentence against the configured keyword patterns and records one hit per
/// (pattern, sentence) match.
/// </summary>
/// <param name="transcript">Sentences to scan; a null or empty transcript yields no hits.</param>
/// <param name="settings">
/// Supplies <c>KeywordPatterns</c>, the optional <c>KeywordCategories</c> lookup (keyed by pattern text)
/// and <c>HeuristicBoundaryWeight</c>. Heuristics are disabled when there are no patterns or the weight
/// is missing, non-positive or NaN.
/// </param>
/// <returns>Hits in pattern order, then sentence order; empty when heuristics are disabled.</returns>
private IReadOnlyList<HeuristicHit> BuildHeuristicHits(IReadOnlyList<TimestampedTranscript> transcript, ClipSegmentationSettings? settings)
{
    if (transcript == null || transcript.Count == 0) return Array.Empty<HeuristicHit>();
    if (settings?.KeywordPatterns == null || settings.KeywordPatterns.Count == 0) return Array.Empty<HeuristicHit>();
    var weight = settings.HeuristicBoundaryWeight ?? 0;
    if (double.IsNaN(weight) || weight <= 0) return Array.Empty<HeuristicHit>();

    // Patterns come from configuration, so bound their run time: a pathological pattern must not be
    // able to stall segmentation through catastrophic backtracking.
    var matchTimeout = TimeSpan.FromSeconds(1);

    var hits = new List<HeuristicHit>();
    var categoryLookup = settings.KeywordCategories ?? new Dictionary<string, string>();
    foreach (var pattern in settings.KeywordPatterns)
    {
        if (string.IsNullOrWhiteSpace(pattern)) continue;

        Regex regex;
        try
        {
            // Not RegexOptions.Compiled: the regex is built per call and used for a single pass,
            // so compilation would cost more than it saves.
            regex = new Regex(pattern, RegexOptions.IgnoreCase, matchTimeout);
        }
        catch (ArgumentException ex)
        {
            // Regex parse failures surface as ArgumentException; skip the bad pattern, keep the rest.
            _logger.LogWarning(ex, "Invalid heuristic pattern: {Pattern}", pattern);
            continue;
        }

        var category = categoryLookup.TryGetValue(pattern, out var mappedCategory) ? mappedCategory : null;

        try
        {
            for (var i = 0; i < transcript.Count; i++)
            {
                var textValue = transcript[i].Text;
                if (string.IsNullOrWhiteSpace(textValue)) continue;
                if (regex.IsMatch(textValue))
                {
                    hits.Add(new HeuristicHit(i, pattern, weight, category));
                }
            }
        }
        catch (RegexMatchTimeoutException ex)
        {
            // Keep the hits already found for this pattern and move on to the next one.
            _logger.LogWarning(ex, "Heuristic pattern timed out and was skipped: {Pattern}", pattern);
        }
    }

    return hits;
}
231+
135232 private string BuildRequestUri ( )
136233 {
137234 var baseUrl = _options . LlmApiUrl ? . TrimEnd ( '/' ) ?? string . Empty ;
@@ -144,7 +241,7 @@ private string BuildRequestUri()
144241 return string . IsNullOrWhiteSpace ( version ) ? path : $ "{ path } ?api-version={ version } ";
145242 }
146243
147- private IReadOnlyList < ClipDefinition > ParseResponse ( string ? body , IReadOnlyList < TimestampedTranscript > transcript )
244+ private IReadOnlyList < ClipDefinition > ParseResponse ( string ? body , IReadOnlyList < TimestampedTranscript > transcript , ClipSegmentationSettings ? settings , IReadOnlyList < HeuristicHit > heuristicHits )
148245 {
149246 if ( string . IsNullOrWhiteSpace ( body ) ) return Array . Empty < ClipDefinition > ( ) ;
150247
@@ -178,12 +275,13 @@ JsonValueKind.Object when root.Value.TryGetProperty("boundaries", out var bounda
178275 var zeroIndex = Math . Clamp ( rawIndex - 1 , 0 , transcript . Count - 1 ) ;
179276 var title = item . TryGetProperty ( "title" , out var titleElement ) ? titleElement . GetString ( ) ?? "Clip" : "Clip" ;
180277 var summary = item . TryGetProperty ( "summary" , out var summaryElement ) ? summaryElement . GetString ( ) ?? string . Empty : string . Empty ;
278+ var category = item . TryGetProperty ( "category" , out var categoryElement ) ? categoryElement . GetString ( ) : null ;
181279 var score = item . TryGetProperty ( "score" , out var scoreElement ) && scoreElement . TryGetDouble ( out var rawScore ) ? Math . Clamp ( rawScore , 0 , 1 ) : 1.0 ;
182- candidates . Add ( new BoundaryCandidate ( zeroIndex , title , summary , score ) ) ;
280+ candidates . Add ( new BoundaryCandidate ( zeroIndex , title , summary , score , false , category ) ) ;
183281 }
184282
185283 var threshold = Math . Clamp ( _options . LlmBoundaryScoreThreshold , 0 , 1 ) ;
186- return CreateClipDefinitions ( transcript , candidates , threshold ) ;
284+ return CreateClipDefinitions ( transcript , candidates , threshold , heuristicHits ) ;
187285 }
188286 catch ( Exception ex )
189287 {
@@ -192,7 +290,7 @@ JsonValueKind.Object when root.Value.TryGetProperty("boundaries", out var bounda
192290 }
193291 }
194292
195- private IReadOnlyList < ClipDefinition > CreateClipDefinitions ( IReadOnlyList < TimestampedTranscript > transcript , List < BoundaryCandidate > candidates , double threshold )
293+ private IReadOnlyList < ClipDefinition > CreateClipDefinitions ( IReadOnlyList < TimestampedTranscript > transcript , List < BoundaryCandidate > candidates , double threshold , IReadOnlyList < HeuristicHit > heuristicHits )
196294 {
197295 if ( transcript == null || transcript . Count == 0 )
198296 return Array . Empty < ClipDefinition > ( ) ;
@@ -205,12 +303,23 @@ private IReadOnlyList<ClipDefinition> CreateClipDefinitions(IReadOnlyList<Timest
205303 map [ index ] = candidate with { Index = index } ;
206304 }
207305
306+ if ( heuristicHits != null && heuristicHits . Count > 0 )
307+ {
308+ foreach ( var hit in heuristicHits )
309+ {
310+ var index = Math . Clamp ( hit . Index , 0 , transcript . Count - 1 ) ;
311+ var heuristicCandidate = new BoundaryCandidate ( index , $ "Heuristic boundary ({ hit . Pattern } )", string . Empty , hit . Weight , true , hit . Category ) ;
312+ if ( ! map . TryGetValue ( index , out var existing ) || heuristicCandidate . Score > existing . Score )
313+ map [ index ] = heuristicCandidate ;
314+ }
315+ }
316+
208317 var ordered = map . Values . OrderBy ( c => c . Index ) . ToList ( ) ;
209318 if ( ordered . Count == 0 )
210319 ordered . Add ( new BoundaryCandidate ( 0 , "Full Program" , "AutoClipper fallback clip" , 1 ) ) ;
211320
212321 if ( ordered [ 0 ] . Index != 0 )
213- ordered . Insert ( 0 , ordered [ 0 ] with { Index = 0 , Score = 1 } ) ;
322+ ordered . Insert ( 0 , ordered [ 0 ] with { Index = 0 , Score = 1 , IsHeuristic = false } ) ;
214323
215324 var filtered = new List < BoundaryCandidate > ( ) ;
216325 foreach ( var candidate in ordered )
@@ -226,17 +335,37 @@ private IReadOnlyList<ClipDefinition> CreateClipDefinitions(IReadOnlyList<Timest
226335 {
227336 var boundary = filtered [ i ] ;
228337 var start = transcript [ boundary . Index ] . Start ;
338+ var endIndex = i + 1 < filtered . Count ? filtered [ i + 1 ] . Index : transcript . Count - 1 ;
229339 var end = i + 1 < filtered . Count ? transcript [ filtered [ i + 1 ] . Index ] . Start : transcript [ ^ 1 ] . End ;
230340 if ( end <= start ) continue ;
231341 var title = string . IsNullOrWhiteSpace ( boundary . Title ) ? $ "Clip { i + 1 } " : boundary . Title ;
232342 var summary = string . IsNullOrWhiteSpace ( boundary . Summary ) ? string . Empty : boundary . Summary ;
233- list . Add ( new ClipDefinition ( title , summary , start , end ) ) ;
343+ var category = DetermineCategory ( boundary , heuristicHits , boundary . Index , endIndex ) ?? "News" ;
344+ list . Add ( new ClipDefinition ( title , summary , start , end , category ) ) ;
345+ _logger . LogInformation ( "Boundary {BoundaryIndex}: {Title} ({Start}-{End}) Score={Score:0.00} Heuristic={IsHeuristic} Category={Category}" , boundary . Index + 1 , title , start , end , boundary . Score , boundary . IsHeuristic , category ) ;
234346 }
235347
236348 return FilterOverlaps ( list ) ;
237349 }
238350
239- private sealed record BoundaryCandidate ( int Index , string Title , string Summary , double Score ) ;
351+
352+
353+
/// <summary>
/// Chooses the category for a clip: the boundary's own category when it has one, otherwise the
/// category of the strongest heuristic hit inside the clip's sentence range.
/// </summary>
/// <param name="boundary">Boundary that opens the clip.</param>
/// <param name="hits">Heuristic hits to fall back on; may be null or empty.</param>
/// <param name="startIndex">First sentence index of the range (inclusive).</param>
/// <param name="endIndex">Last sentence index of the range (inclusive).</param>
/// <returns>The chosen category, or null when neither source provides one.</returns>
private string? DetermineCategory(BoundaryCandidate boundary, IReadOnlyList<HeuristicHit>? hits, int startIndex, int endIndex)
{
    if (!string.IsNullOrWhiteSpace(boundary.Category))
    {
        return boundary.Category;
    }

    if (hits is not { Count: > 0 })
    {
        return null;
    }

    // Highest weight wins; ties go to the earliest sentence. Hits without a category never qualify.
    var rankedCategories =
        from candidate in hits
        where candidate.Index >= startIndex && candidate.Index <= endIndex
        where !string.IsNullOrWhiteSpace(candidate.Category)
        orderby candidate.Weight descending, candidate.Index
        select candidate.Category;

    return rankedCategories.FirstOrDefault();
}
365+
/// <summary>A proposed story boundary, produced either from the LLM response or from a heuristic keyword hit.</summary>
/// <param name="Index">Zero-based index of the transcript sentence where the story starts.</param>
/// <param name="Title">Clip title; callers substitute a generated title when this is blank.</param>
/// <param name="Summary">Clip summary; may be empty.</param>
/// <param name="Score">Confidence used when competing for an index: the LLM score clamped to 0-1, or the heuristic weight.</param>
/// <param name="IsHeuristic">True when the candidate was created from a heuristic hit rather than the LLM response.</param>
/// <param name="Category">Optional category label; null when none was supplied.</param>
366+ private sealed record BoundaryCandidate ( int Index , string Title , string Summary , double Score , bool IsHeuristic = false , string ? Category = null ) ;
367+
/// <summary>A keyword-pattern match against a single transcript sentence.</summary>
/// <param name="Index">Zero-based index of the matching transcript sentence.</param>
/// <param name="Pattern">The regular-expression pattern text that matched.</param>
/// <param name="Weight">Weight taken from the settings' heuristic boundary weight; used as the score of the resulting boundary candidate.</param>
/// <param name="Category">Category mapped to the pattern in the settings' keyword categories, or null when unmapped.</param>
368+ private sealed record HeuristicHit ( int Index , string Pattern , double Weight , string ? Category ) ;
240369
241370 private static string StripCodeFence ( string body )
242371 {
@@ -300,3 +429,6 @@ private static IReadOnlyList<ClipDefinition> FilterOverlaps(IReadOnlyList<ClipDe
300429 return result ;
301430 }
302431}
432+
433+
434+
0 commit comments