Skip to content

Commit 422d52a

Browse files
committed
update segmentation and labels, added weather/traffic/ads labels
1 parent a06111a commit 422d52a

File tree

8 files changed

+273
-72
lines changed

8 files changed

+273
-72
lines changed

services/net/auto-clipper/AutoClipperManager.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ private ContentModel BuildClipContentModel(ContentModel sourceContent, ClipDefin
590590
Status = ContentStatus.Draft,
591591
Uid = BaseService.GetContentHash(sourceContent.Source?.Code ?? "AutoClipper", $"{sourceContent.Uid}-clip-{clipIndex}", sourceContent.PublishedOn),
592592
Headline = $"{sourceContent.Headline} [AutoClipper #{clipIndex}] {definition.Title}",
593-
Summary = $"[AutoClipper] {clipSummary}",
593+
Summary = $"[AutoClipper:{definition.Category}] {clipSummary}",
594594
Body = transcriptBody,
595595
SourceUrl = sourceContent.SourceUrl,
596596
PublishedOn = sourceContent.PublishedOn,
@@ -689,3 +689,4 @@ private static string FormatTimestamp(TimeSpan value)
689689
/// <returns>destination file name</returns>
690690
#endregion
691691
}
692+

services/net/auto-clipper/Config/StationProfile.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ public class StationTextProfile
2626
public double ChunkSizeSeconds { get; set; } = 3.0;
2727
public double ChunkOverlapRatio { get; set; } = 0.5;
2828
public double HeuristicBoundaryWeight { get; set; } = 0.15;
29+
public Dictionary<string, string> KeywordCategories { get; set; } = new();
2930
public bool LlmSegmentation { get; set; } = true;
3031
public string LlmModel { get; set; } = string.Empty;
3132
public string LlmPrompt { get; set; } = string.Empty;

services/net/auto-clipper/Config/Stations/CKNW.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ transcription:
99
text:
1010
chunk_size_s: 3.0
1111
chunk_overlap_ratio: 0.5
12+
heuristic_boundary_weight: 0.2
13+
keyword_categories:
14+
"(?i)traffic": Traffic
15+
"(?i)weather": Weather
16+
"(?i)sponsor": Ad
17+
"(?i)commercial": Ad
1218
llm_segmentation: true
1319
llm_model: gpt-4o-mini
1420
system_prompt: |
@@ -25,8 +31,8 @@ text:
2531
]
2632
}
2733
Requirements:
28-
- `index` matches the numbered sentence (1-based) where the story starts.
29-
- `score` ranges 0-1 and reflects confidence.
34+
- index matches the numbered sentence (1-based) where the story starts.
35+
- score ranges 0-1 and reflects confidence.
3036
- Keep boundaries chronological, never duplicate indices, and do not invent timestamps.
3137
- Use the provided timestamps/lines for context; ignore anything not in the transcript.
3238
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
using System;
2+
13
namespace TNO.Services.AutoClipper.LLM;
24

35
/// <summary>
46
/// Represents a suggested clip from the LLM response.
57
/// </summary>
6-
public record ClipDefinition(string Title, string Summary, TimeSpan Start, TimeSpan End)
8+
public record ClipDefinition(string Title, string Summary, TimeSpan Start, TimeSpan End, string Category = "News")
79
{
810
public bool IsValid => End > Start;
911
}

services/net/auto-clipper/LLM/ClipSegmentationService.cs

Lines changed: 145 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
1+
using System;
2+
using System.Text.RegularExpressions;
23
using System.Collections.Generic;
34
using System.Globalization;
45
using System.Linq;
@@ -17,6 +18,8 @@ public class ClipSegmentationService : IClipSegmentationService
1718
{
1819
private const string DefaultSystemPrompt = "You are a news segment tool. Analyse timestamped transcripts, choose where new stories begin, and output JSON suitable for ffmpeg clip creation.";
1920

21+
private const int ParagraphSentenceCount = 4;
22+
2023
private const string DefaultPrompt = """
2124
You will receive a transcript formatted as numbered sentences (index. timestamp range :: sentence).
2225
Identify up to {{max_clips}} places where a new story starts and return ONLY JSON:
@@ -29,9 +32,13 @@ You will receive a transcript formatted as numbered sentences (index. timestamp
2932
Rules:
3033
- `index` is the numbered sentence (1-based) where the new story begins.
3134
- `score` ranges from 0-1; higher means stronger confidence.
35+
- Consider the optional heuristic cues before discarding a boundary.
3236
- Keep boundaries chronological and avoid duplicates.
3337
- Do not invent timestamps; rely only on the provided lines.
3438
39+
Heuristic cues (if provided):
40+
{{heuristic_notes}}
41+
3542
Transcript:
3643
{{transcript}}
3744
""";
@@ -57,7 +64,8 @@ public async Task<IReadOnlyList<ClipDefinition>> GenerateClipsAsync(IReadOnlyLis
5764

5865
try
5966
{
60-
var prompt = BuildPrompt(transcript, settings);
67+
var heuristicHits = BuildHeuristicHits(transcript, settings);
68+
var prompt = BuildPrompt(transcript, settings, heuristicHits);
6169
var systemPrompt = string.IsNullOrWhiteSpace(settings?.SystemPrompt) ? DefaultSystemPrompt : settings!.SystemPrompt!;
6270
var payload = new
6371
{
@@ -79,7 +87,7 @@ public async Task<IReadOnlyList<ClipDefinition>> GenerateClipsAsync(IReadOnlyLis
7987
var body = await response.Content.ReadAsStringAsync(cancellationToken);
8088
response.EnsureSuccessStatusCode();
8189

82-
var clipDefinitions = ParseResponse(body, transcript);
90+
var clipDefinitions = ParseResponse(body, transcript, settings, heuristicHits);
8391
if (clipDefinitions.Count == 0)
8492
{
8593
_logger.LogWarning("LLM segmentation did not return any clips. Falling back to a single clip definition.");
@@ -101,25 +109,36 @@ private ClipDefinition BuildFallbackClip(IReadOnlyList<TimestampedTranscript> tr
101109
return new ClipDefinition("Full Program", "AutoClipper fallback clip", TimeSpan.Zero, end);
102110
}
103111

104-
private string BuildPrompt(IReadOnlyList<TimestampedTranscript> transcript, ClipSegmentationSettings? overrides)
112+
private string BuildPrompt(IReadOnlyList<TimestampedTranscript> transcript, ClipSegmentationSettings? overrides, IReadOnlyList<HeuristicHit> heuristicHits)
105113
{
106114
var template = !string.IsNullOrWhiteSpace(overrides?.PromptOverride)
107115
? overrides!.PromptOverride!
108116
: string.IsNullOrWhiteSpace(_options.LlmPrompt) ? DefaultPrompt : _options.LlmPrompt;
117+
var includesHeuristicPlaceholder = template.Contains("{{heuristic_notes}}");
109118
var limit = overrides?.PromptCharacterLimit ?? Math.Max(1000, _options.LlmPromptCharacterLimit);
110119
var transcriptBody = BuildPromptTranscript(transcript, limit);
120+
var heuristicNotes = BuildHeuristicNotes(heuristicHits, transcript);
111121

112122
var maxClips = overrides?.MaxStories ?? _options.LlmMaxStories;
113123
if (maxClips <= 0) maxClips = _options.LlmMaxStories;
114124

115-
return template
125+
var prompt = template
116126
.Replace("{{max_clips}}", maxClips.ToString(CultureInfo.InvariantCulture))
117-
.Replace("{{transcript}}", transcriptBody);
127+
.Replace("{{transcript}}", transcriptBody)
128+
.Replace("{{heuristic_notes}}", heuristicNotes);
129+
130+
if (!includesHeuristicPlaceholder && !string.IsNullOrWhiteSpace(heuristicNotes))
131+
{
132+
prompt += "\n\nHeuristic cues (for reference):\n" + heuristicNotes;
133+
}
134+
135+
return prompt;
118136
}
119137

120138
private string BuildPromptTranscript(IReadOnlyList<TimestampedTranscript> transcript, int limit)
121139
{
122140
var builder = new StringBuilder();
141+
builder.AppendLine("Sentences:");
123142
for (var i = 0; i < transcript.Count; i++)
124143
{
125144
var sentence = transcript[i];
@@ -129,9 +148,87 @@ private string BuildPromptTranscript(IReadOnlyList<TimestampedTranscript> transc
129148
break;
130149
builder.AppendLine(line);
131150
}
151+
152+
builder.AppendLine();
153+
builder.AppendLine("Paragraphs:");
154+
var paragraphNumber = 1;
155+
var index = 0;
156+
while (index < transcript.Count && builder.Length < limit)
157+
{
158+
var start = index;
159+
var end = Math.Min(index + ParagraphSentenceCount, transcript.Count);
160+
var sentences = new List<string>();
161+
for (var j = start; j < end; j++)
162+
{
163+
var sentence = transcript[j];
164+
if (string.IsNullOrWhiteSpace(sentence.Text)) continue;
165+
sentences.Add(sentence.Text.Trim());
166+
}
167+
168+
if (sentences.Count > 0)
169+
{
170+
var line = $"Paragraph {paragraphNumber} (sentences {start + 1}-{end}): {string.Join(" / ", sentences)}";
171+
if (builder.Length + line.Length > limit) break;
172+
builder.AppendLine(line);
173+
paragraphNumber++;
174+
}
175+
176+
index += ParagraphSentenceCount;
177+
}
178+
132179
return builder.ToString();
133180
}
134181

182+
/// <summary>
/// Render the heuristic hits as prompt text, one line per hit, ordered by sentence index.
/// Each line carries the 1-based sentence number, the formatted start time of that sentence,
/// the pattern that matched and the trimmed sentence text.
/// </summary>
/// <param name="hits">Heuristic hits to describe; may be null or empty.</param>
/// <param name="transcript">Transcript the hit indexes refer to (hit.Index is used as a zero-based index into it).</param>
/// <returns>The joined, trimmed lines, or the literal "&lt;none&gt;" when there are no hits.</returns>
private string BuildHeuristicNotes(IReadOnlyList<HeuristicHit>? hits, IReadOnlyList<TimestampedTranscript> transcript)
{
    if (hits is null || hits.Count == 0)
    {
        return "<none>";
    }

    var noteLines = hits
        .OrderBy(hit => hit.Index)
        .Select(hit =>
        {
            var entry = transcript[hit.Index];
            // Blank sentences still get a line; only the excerpt is left empty.
            var excerpt = string.IsNullOrWhiteSpace(entry.Text) ? string.Empty : entry.Text.Trim();
            return $"Sentence {hit.Index + 1} ({FormatTimestamp(entry.Start)}): pattern '{hit.Pattern}' -> {excerpt}";
        });

    // Trim the whole block so an empty final excerpt does not leave trailing whitespace.
    return string.Join(Environment.NewLine, noteLines).Trim();
}
194+
195+
/// <summary>
/// Scan every transcript sentence against the configured keyword patterns and record one hit
/// per (pattern, sentence) match. Each hit carries the configured heuristic weight and, when
/// the pattern string is a key in <c>KeywordCategories</c>, the mapped category.
/// </summary>
/// <param name="transcript">Sentences to scan; blank sentences are skipped.</param>
/// <param name="settings">Per-request settings supplying the patterns, weight and category map.</param>
/// <returns>
/// The hits in pattern order then sentence order, or an empty list when there is no transcript,
/// no patterns, or the heuristic weight is not a positive number.
/// </returns>
private IReadOnlyList<HeuristicHit> BuildHeuristicHits(IReadOnlyList<TimestampedTranscript> transcript, ClipSegmentationSettings? settings)
{
    if (transcript == null || transcript.Count == 0) return Array.Empty<HeuristicHit>();
    if (settings?.KeywordPatterns == null || settings.KeywordPatterns.Count == 0) return Array.Empty<HeuristicHit>();

    var weight = settings.HeuristicBoundaryWeight ?? 0;
    // Written as !(weight > 0) so that NaN is rejected along with zero and negative weights.
    if (!(weight > 0)) return Array.Empty<HeuristicHit>();

    // Patterns come from station configuration, so bound the time a single match may take
    // to keep a catastrophically backtracking expression from stalling segmentation.
    var matchTimeout = TimeSpan.FromSeconds(1);
    var categories = settings.KeywordCategories;
    var hits = new List<HeuristicHit>();

    // A pattern listed twice would otherwise yield duplicate hits (and duplicate prompt cues).
    var patterns = settings.KeywordPatterns
        .Where(p => !string.IsNullOrWhiteSpace(p))
        .Distinct(StringComparer.Ordinal);

    foreach (var pattern in patterns)
    {
        Regex regex;
        try
        {
            // Not RegexOptions.Compiled: the instance is built per call and discarded, so the
            // compilation cost would not be amortised over enough matches.
            regex = new Regex(pattern, RegexOptions.IgnoreCase, matchTimeout);
        }
        catch (ArgumentException ex)
        {
            _logger.LogWarning(ex, "Invalid heuristic pattern: {Pattern}", pattern);
            continue;
        }

        string? category = null;
        if (categories is not null && categories.TryGetValue(pattern, out var mappedCategory))
        {
            category = mappedCategory;
        }

        try
        {
            for (var i = 0; i < transcript.Count; i++)
            {
                var textValue = transcript[i].Text;
                if (string.IsNullOrWhiteSpace(textValue)) continue;
                if (regex.IsMatch(textValue))
                {
                    hits.Add(new HeuristicHit(i, pattern, weight, category));
                }
            }
        }
        catch (RegexMatchTimeoutException ex)
        {
            // Keep the hits gathered so far and move on to the next pattern.
            _logger.LogWarning(ex, "Heuristic pattern timed out and was skipped for the remaining sentences: {Pattern}", pattern);
        }
    }

    return hits;
}
231+
135232
private string BuildRequestUri()
136233
{
137234
var baseUrl = _options.LlmApiUrl?.TrimEnd('/') ?? string.Empty;
@@ -144,7 +241,7 @@ private string BuildRequestUri()
144241
return string.IsNullOrWhiteSpace(version) ? path : $"{path}?api-version={version}";
145242
}
146243

147-
private IReadOnlyList<ClipDefinition> ParseResponse(string? body, IReadOnlyList<TimestampedTranscript> transcript)
244+
private IReadOnlyList<ClipDefinition> ParseResponse(string? body, IReadOnlyList<TimestampedTranscript> transcript, ClipSegmentationSettings? settings, IReadOnlyList<HeuristicHit> heuristicHits)
148245
{
149246
if (string.IsNullOrWhiteSpace(body)) return Array.Empty<ClipDefinition>();
150247

@@ -178,12 +275,13 @@ JsonValueKind.Object when root.Value.TryGetProperty("boundaries", out var bounda
178275
var zeroIndex = Math.Clamp(rawIndex - 1, 0, transcript.Count - 1);
179276
var title = item.TryGetProperty("title", out var titleElement) ? titleElement.GetString() ?? "Clip" : "Clip";
180277
var summary = item.TryGetProperty("summary", out var summaryElement) ? summaryElement.GetString() ?? string.Empty : string.Empty;
278+
var category = item.TryGetProperty("category", out var categoryElement) ? categoryElement.GetString() : null;
181279
var score = item.TryGetProperty("score", out var scoreElement) && scoreElement.TryGetDouble(out var rawScore) ? Math.Clamp(rawScore, 0, 1) : 1.0;
182-
candidates.Add(new BoundaryCandidate(zeroIndex, title, summary, score));
280+
candidates.Add(new BoundaryCandidate(zeroIndex, title, summary, score, false, category));
183281
}
184282

185283
var threshold = Math.Clamp(_options.LlmBoundaryScoreThreshold, 0, 1);
186-
return CreateClipDefinitions(transcript, candidates, threshold);
284+
return CreateClipDefinitions(transcript, candidates, threshold, heuristicHits);
187285
}
188286
catch (Exception ex)
189287
{
@@ -192,7 +290,7 @@ JsonValueKind.Object when root.Value.TryGetProperty("boundaries", out var bounda
192290
}
193291
}
194292

195-
private IReadOnlyList<ClipDefinition> CreateClipDefinitions(IReadOnlyList<TimestampedTranscript> transcript, List<BoundaryCandidate> candidates, double threshold)
293+
private IReadOnlyList<ClipDefinition> CreateClipDefinitions(IReadOnlyList<TimestampedTranscript> transcript, List<BoundaryCandidate> candidates, double threshold, IReadOnlyList<HeuristicHit> heuristicHits)
196294
{
197295
if (transcript == null || transcript.Count == 0)
198296
return Array.Empty<ClipDefinition>();
@@ -205,12 +303,23 @@ private IReadOnlyList<ClipDefinition> CreateClipDefinitions(IReadOnlyList<Timest
205303
map[index] = candidate with { Index = index };
206304
}
207305

306+
if (heuristicHits != null && heuristicHits.Count > 0)
307+
{
308+
foreach (var hit in heuristicHits)
309+
{
310+
var index = Math.Clamp(hit.Index, 0, transcript.Count - 1);
311+
var heuristicCandidate = new BoundaryCandidate(index, $"Heuristic boundary ({hit.Pattern})", string.Empty, hit.Weight, true, hit.Category);
312+
if (!map.TryGetValue(index, out var existing) || heuristicCandidate.Score > existing.Score)
313+
map[index] = heuristicCandidate;
314+
}
315+
}
316+
208317
var ordered = map.Values.OrderBy(c => c.Index).ToList();
209318
if (ordered.Count == 0)
210319
ordered.Add(new BoundaryCandidate(0, "Full Program", "AutoClipper fallback clip", 1));
211320

212321
if (ordered[0].Index != 0)
213-
ordered.Insert(0, ordered[0] with { Index = 0, Score = 1 });
322+
ordered.Insert(0, ordered[0] with { Index = 0, Score = 1, IsHeuristic = false });
214323

215324
var filtered = new List<BoundaryCandidate>();
216325
foreach (var candidate in ordered)
@@ -226,17 +335,37 @@ private IReadOnlyList<ClipDefinition> CreateClipDefinitions(IReadOnlyList<Timest
226335
{
227336
var boundary = filtered[i];
228337
var start = transcript[boundary.Index].Start;
338+
var endIndex = i + 1 < filtered.Count ? filtered[i + 1].Index : transcript.Count - 1;
229339
var end = i + 1 < filtered.Count ? transcript[filtered[i + 1].Index].Start : transcript[^1].End;
230340
if (end <= start) continue;
231341
var title = string.IsNullOrWhiteSpace(boundary.Title) ? $"Clip {i + 1}" : boundary.Title;
232342
var summary = string.IsNullOrWhiteSpace(boundary.Summary) ? string.Empty : boundary.Summary;
233-
list.Add(new ClipDefinition(title, summary, start, end));
343+
var category = DetermineCategory(boundary, heuristicHits, boundary.Index, endIndex) ?? "News";
344+
list.Add(new ClipDefinition(title, summary, start, end, category));
345+
_logger.LogInformation("Boundary {BoundaryIndex}: {Title} ({Start}-{End}) Score={Score:0.00} Heuristic={IsHeuristic} Category={Category}", boundary.Index + 1, title, start, end, boundary.Score, boundary.IsHeuristic, category);
234346
}
235347

236348
return FilterOverlaps(list);
237349
}
238350

239-
private sealed record BoundaryCandidate(int Index, string Title, string Summary, double Score);
351+
352+
353+
354+
/// <summary>
/// Pick the category for a clip. The boundary's own category wins when it is non-blank;
/// otherwise the category is taken from the heuristic hit inside the sentence range that has
/// the highest weight, with ties going to the lowest sentence index and then to the earliest
/// entry in <paramref name="hits"/>.
/// </summary>
/// <param name="boundary">Boundary that opens the clip.</param>
/// <param name="hits">Heuristic hits for the whole transcript; may be null or empty.</param>
/// <param name="startIndex">First sentence index considered (inclusive).</param>
/// <param name="endIndex">Last sentence index considered (inclusive).</param>
/// <returns>The chosen category, or null when neither the boundary nor any in-range hit has one.</returns>
private string? DetermineCategory(BoundaryCandidate boundary, IReadOnlyList<HeuristicHit>? hits, int startIndex, int endIndex)
{
    if (!string.IsNullOrWhiteSpace(boundary.Category))
    {
        return boundary.Category;
    }

    if (hits is null || hits.Count == 0)
    {
        return null;
    }

    // NOTE(review): endIndex is inclusive here, and the caller visible in this file passes the
    // next boundary's sentence index for non-final clips, so a hit on the next clip's first
    // sentence is also considered for this clip. Confirm that is intended.
    HeuristicHit? winner = null;
    foreach (var candidate in hits)
    {
        if (candidate.Index < startIndex || candidate.Index > endIndex) continue;
        if (string.IsNullOrWhiteSpace(candidate.Category)) continue;

        if (winner is null)
        {
            winner = candidate;
            continue;
        }

        // CompareTo gives the same ordering the default comparer applies when sorting by weight.
        var byWeight = candidate.Weight.CompareTo(winner.Weight);
        if (byWeight > 0 || (byWeight == 0 && candidate.Index < winner.Index))
        {
            winner = candidate;
        }
    }

    return winner?.Category;
}
365+
366+
/// <summary>
/// A proposed story boundary, coming either from the LLM response or from a heuristic keyword hit.
/// </summary>
/// <param name="Index">Zero-based index of the transcript sentence where the clip starts.</param>
/// <param name="Title">Title used for the clip that begins at this boundary.</param>
/// <param name="Summary">Summary used for the clip; empty for heuristic boundaries.</param>
/// <param name="Score">Confidence; LLM scores are clamped to 0-1 and heuristic boundaries use the hit weight.</param>
/// <param name="IsHeuristic">True when the boundary was created from a heuristic hit.</param>
/// <param name="Category">Optional category taken from the LLM item or the heuristic hit.</param>
private sealed record BoundaryCandidate(int Index, string Title, string Summary, double Score, bool IsHeuristic = false, string? Category = null);
367+
368+
/// <summary>
/// One match of a configured keyword pattern against a transcript sentence.
/// </summary>
/// <param name="Index">Zero-based index of the matching sentence in the transcript.</param>
/// <param name="Pattern">The regular-expression pattern string that matched.</param>
/// <param name="Weight">The heuristic boundary weight taken from the segmentation settings.</param>
/// <param name="Category">Category mapped to the pattern in the settings, or null when none is mapped.</param>
private sealed record HeuristicHit(int Index, string Pattern, double Weight, string? Category);
240369

241370
private static string StripCodeFence(string body)
242371
{
@@ -300,3 +429,6 @@ private static IReadOnlyList<ClipDefinition> FilterOverlaps(IReadOnlyList<ClipDe
300429
return result;
301430
}
302431
}
432+
433+
434+

services/net/auto-clipper/LLM/ClipSegmentationSettings.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
using System.Collections.Generic;
2+
13
namespace TNO.Services.AutoClipper.LLM;
24

35
public class ClipSegmentationSettings
@@ -7,4 +9,7 @@ public class ClipSegmentationSettings
79
public string? SystemPrompt { get; set; }
810
public int? PromptCharacterLimit { get; set; }
911
public int? MaxStories { get; set; }
12+
public IReadOnlyList<string>? KeywordPatterns { get; set; }
13+
public double? HeuristicBoundaryWeight { get; set; }
14+
public IReadOnlyDictionary<string, string>? KeywordCategories { get; set; }
1015
}

services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System.Linq;
12
using Microsoft.Extensions.Logging;
23
using Microsoft.Extensions.Options;
34
using TNO.Services.AutoClipper.Audio;
@@ -60,8 +61,15 @@ private ClipSegmentationSettings BuildSegmentationSettings(StationProfile profil
6061
ModelOverride = string.IsNullOrWhiteSpace(profile.Text.LlmModel) ? null : profile.Text.LlmModel,
6162
SystemPrompt = string.IsNullOrWhiteSpace(profile.Text.SystemPrompt) ? null : profile.Text.SystemPrompt,
6263
PromptCharacterLimit = profile.Text.PromptCharacterLimit,
63-
MaxStories = profile.Text.MaxStories
64+
MaxStories = profile.Text.MaxStories,
65+
KeywordPatterns = profile.Heuristics.KeywordPatterns?.ToArray(),
66+
HeuristicBoundaryWeight = profile.Text.HeuristicBoundaryWeight,
67+
KeywordCategories = profile.Text.KeywordCategories?.ToDictionary(kvp => kvp.Key, kvp => kvp.Value)
6468
};
6569
}
6670
}
6771

72+
73+
74+
75+

0 commit comments

Comments
 (0)