Skip to content

Commit b6fee87

Browse files
AlessiaYChenFosol
andauthored
MMI-3400 (#2544)
* auto-clipper initial commit * segment refine * update segmentation and labels, added weather/traffic/ads labels * add documents * Update auto clipper Add DevOps configs * Publish tno-core:1.0.30 * Fix UI * Fix build * Debug * Debug * Fix config * change to batch transcription and update prompt, heuristics, etc. * update prompt and heuristics * Add Auto Clipper Service segment refine update segmentation and labels, added weather/traffic/ads labels add documents Add DevOps configs Publish tno-core:1.0.30 * add missing file * update prompt for clip summary * remove deleted files * update * update station file * update title format * update llm temperature * update temperature * add category in the title --------- Co-authored-by: Jeremy Foster <jeremy.foster@fosol.ca>
1 parent 9772f77 commit b6fee87

File tree

4 files changed

+75
-21
lines changed

4 files changed

+75
-21
lines changed

services/net/auto-clipper/AutoClipperManager.cs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Globalization;
22
using System.Text;
3+
using System.Text.RegularExpressions;
34
using Confluent.Kafka;
45
using Microsoft.Extensions.Logging;
56
using Microsoft.Extensions.Options;
@@ -584,6 +585,10 @@ private ContentModel BuildClipContentModel(ContentModel sourceContent, ClipDefin
584585
// Calculate clip time = parent content publish time + clip start offset
585586
var clipTime = sourceContent.PublishedOn?.Add(definition.Start);
586587
var timePrefix = clipTime?.ToString("HH:mm");
588+
var clipTitle = FormatClipTitle(definition.Title);
589+
var categoryLabel = FormatClipCategory(definition.Category);
590+
var headlineCore = string.IsNullOrEmpty(timePrefix) ? clipTitle : $"{timePrefix} - {clipTitle}";
591+
var headline = string.IsNullOrEmpty(categoryLabel) ? headlineCore : $"[{categoryLabel}] {headlineCore}";
587592

588593
return new ContentModel
589594
{
@@ -601,7 +606,7 @@ private ContentModel BuildClipContentModel(ContentModel sourceContent, ClipDefin
601606
Byline = sourceContent.Byline,
602607
Status = ContentStatus.Draft,
603608
Uid = BaseService.GetContentHash(sourceContent.Source?.Code ?? "AutoClipper", $"{sourceContent.Uid}-clip-{clipIndex}", sourceContent.PublishedOn),
604-
Headline = string.IsNullOrEmpty(timePrefix) ? definition.Title : $"{timePrefix} - {definition.Title}",
609+
Headline = headline,
605610
Summary = $"[AutoClipper:{definition.Category}]\n{clipSummary}",
606611
Body = transcriptBody,
607612
SourceUrl = sourceContent.SourceUrl,
@@ -650,6 +655,24 @@ private static string BuildTranscriptDocument(IReadOnlyList<TimestampedTranscrip
650655
return sb.ToString().Trim();
651656
}
652657

658+
// Matches the zero-width positions where a camel-cased slug should be split:
// between a lowercase letter or digit and an uppercase letter ("trafficUpdate" -> "traffic|Update"),
// and between an acronym and a following capitalized word ("BCFerries" -> "BC|Ferries").
private static readonly Regex CamelCaseBoundaryRegex = new("(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", RegexOptions.Compiled);

/// <summary>
/// Turns a clip title slug into a human readable title: underscores and hyphens become
/// spaces, camel-case boundaries are split, and runs of whitespace collapse to one space.
/// </summary>
/// <param name="title">The raw title; may be null, empty or whitespace.</param>
/// <returns>
/// The normalized title, or "Clip" when <paramref name="title"/> is null/blank or contains
/// nothing but separators (for example "---").
/// </returns>
private static string FormatClipTitle(string? title)
{
    const string fallbackTitle = "Clip";
    if (string.IsNullOrWhiteSpace(title)) return fallbackTitle;

    var normalized = title.Trim().Replace('_', ' ').Replace('-', ' ');
    normalized = CamelCaseBoundaryRegex.Replace(normalized, " ");
    normalized = Regex.Replace(normalized, "\\s+", " ").Trim();

    // A title made up only of separators passes the blank check above but normalizes to
    // nothing; fall back so the headline never ends in an empty title.
    return normalized.Length == 0 ? fallbackTitle : normalized;
}
669+
670+
/// <summary>
/// Produces the category label used in a clip headline: the trimmed category with every
/// run of whitespace collapsed to a single space.
/// </summary>
/// <param name="category">The raw category; may be null, empty or whitespace.</param>
/// <returns>The cleaned label, or an empty string when no category text is present.</returns>
private static string FormatClipCategory(string? category) =>
    string.IsNullOrWhiteSpace(category)
        ? string.Empty
        : Regex.Replace(category.Trim(), "\\s+", " ");
675+
653676
private static string FormatTimestamp(TimeSpan value)
654677
{
655678
var hours = (int)Math.Floor(value.TotalHours);
@@ -684,4 +707,3 @@ private static string FormatTimestamp(TimeSpan value)
684707
}
685708
#endregion
686709
}
687-

services/net/auto-clipper/Config/Stations/CKNW.yml

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Station specific configuration for CKNW. Adjust as needed.
1+
# Station-specific configuration for CKNW.
22
name: CKNW
33
sample_rate: 16000
44
transcription:
@@ -19,16 +19,26 @@ text:
1919
"(?i)coming up": Promo
2020
llm_segmentation: true
2121
llm_model: gpt-5.1-chat
22-
llm_temperature: 1
22+
llm_temperature: 1 # Currently 1; use 0.0 when deterministic JSON and stable indices are required.
2323
system_prompt: |
2424
You are a Broadcast Structure Parser. Your ONLY job is to detect segment transitions.
25-
Output MUST be a single, raw JSON object.
26-
CRITICAL: Start your response with '{' and end with '}'.
25+
Output MUST be a single, raw JSON object.
26+
CRITICAL: Start your response with '{' and end with '}'.
2727
DO NOT use markdown, backticks, or "```json" blocks. No introductory or closing text.
2828
max_stories: 15
2929
llm_prompt: |
3030
Identify every point in the transcript where the topic or segment type changes.
31-
31+
32+
# SUMMARY RULES
33+
1. **Prioritize Anchor Leads**: For News, derive the summary from the anchor's introduction or the first three sentences of the report.
34+
2. **Active Voice**: Use active journalistic voice (e.g., "Surrey Council rejects housing proposal" NOT "A report about a meeting").
35+
3. **Category Formulas**:
36+
- News: [Subject] [Action] (e.g., "Abbotsford shooting victim’s son calls for urgent investigation").
37+
- Traffic: [Location] [Incident/Status] (e.g., "Highway 99 northbound blocked at Hwy 17A due to crash").
38+
- Weather: [Condition] + [High Temp] (e.g., "Mix of sun and cloud with a high of 9 degrees").
39+
- Ad: [Business Name] + [Offer/Service] (e.g., "McDonald's features Egg McMuffin with Hollandaise sauce").
40+
4. **One Sentence Only**: Summaries MUST be a single, concise sentence.
41+
3242
# STRUCTURAL RULES (To Prevent Bundling)
3343
1. **The Sign-off Rule**: Phrases like "Global News," "CBC News," or "Reporting live" followed by a name mark the END of a segment. The very next sentence MUST be a new boundary.
3444
2. **The Handoff Rule**: When an anchor introduces a reporter (e.g., "As Joshua reports..."), the boundary starts at the ANCHOR'S introduction line.
@@ -42,7 +52,7 @@ text:
4252
"index": [Sentence Number],
4353
"category": "News | Traffic | Weather | Ad | Promo",
4454
"title": "[Short Slug]",
45-
"summary": "[One Sentence Recap]",
55+
"summary": "[Journalistic Summary Sentence]",
4656
"score": 0.95
4757
}
4858
]
@@ -61,7 +71,7 @@ heuristics:
6171
weight: 0.65
6272
category: Promo
6373
note: Signals a hard break/transition
64-
74+
6575
# --- New Anchor-Specific Patterns ---
6676
- pattern: "(?i)Ian Wilson"
6777
weight: 0.85
@@ -75,7 +85,6 @@ heuristics:
7585
weight: 0.60
7686
category: News
7787
note: Primary News anchor reset
78-
7988
# --- Service Cues ---
8089
- pattern: "(?i)traffic update"
8190
weight: 0.6

tools/auto-clipper-harness/.env.sample

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@ AUTOCLIP_HARNESS_BATCH_ENDPOINT=
88
AUTOCLIP_HARNESS_BATCH_VERSION=v3.2
99
AUTOCLIP_HARNESS_BATCH_POLL_SECONDS=10
1010
AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES=45
11+
# Provide either the full chat-completions endpoint or the base resource URL.
1112
AUTOCLIP_HARNESS_LLM_URL=https://your-resource.openai.azure.com
1213
AUTOCLIP_HARNESS_LLM_KEY=
13-
AUTOCLIP_HARNESS_LLM_DEPLOYMENT=
14-
AUTOCLIP_HARNESS_LLM_VERSION=2024-02-15-preview
14+
AUTOCLIP_HARNESS_LLM_DEPLOYMENT=gpt-4o-mini
15+
AUTOCLIP_HARNESS_LLM_MODEL=gpt-4o-mini
16+
AUTOCLIP_HARNESS_LLM_VERSION=2024-07-18
1517
AUTOCLIP_HARNESS_LANGUAGE=en-US
1618
AUTOCLIP_HARNESS_MAX_STORIES=5
19+

tools/auto-clipper-harness/Program.cs

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Text;
2+
using System.Linq;
23
using System.Text.RegularExpressions;
34
using Microsoft.Extensions.Logging;
45
using Microsoft.Extensions.Options;
@@ -38,17 +39,25 @@
3839
var stationOptions = Options.Create(new AutoClipperOptions { StationConfigPath = stationConfigPath });
3940
var stationConfiguration = new StationConfigurationService(stationOptions, loggerFactory.CreateLogger<StationConfigurationService>());
4041
var stationProfile = stationConfiguration.GetProfile(stationCode);
41-
4242
var language = args.Length > 1
4343
? args[1]
4444
: Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LANGUAGE")
4545
?? (!string.IsNullOrWhiteSpace(stationProfile.Transcription.Language) ? stationProfile.Transcription.Language : "en-US");
4646
var sampleRate = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_SAMPLE_RATE"), out var sr)
4747
? sr
4848
: (stationProfile.Transcription.SampleRate > 0 ? stationProfile.Transcription.SampleRate : 16000);
49-
5049
var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger<AudioNormalizer>());
5150
var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate);
51+
var llmBaseUrl = RequireEnv("AUTOCLIP_HARNESS_LLM_URL").Trim();
52+
var llmDeployment = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_DEPLOYMENT");
53+
var llmVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_VERSION") ?? "2024-07-18";
54+
var llmModel = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_MODEL");
55+
var llmEndpoint = BuildLlmEndpointUri(llmBaseUrl, llmDeployment, llmVersion);
56+
var defaultModel = string.IsNullOrWhiteSpace(llmModel)
57+
? (!string.IsNullOrWhiteSpace(llmDeployment) ? llmDeployment : "gpt-4o-mini")
58+
: llmModel;
59+
60+
5261

5362
var options = Options.Create(new AutoClipperOptions
5463
{
@@ -60,21 +69,21 @@
6069
AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45,
6170
AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"),
6271
AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"),
63-
AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180,
64-
LlmApiUrl = new Uri(RequireEnv("AUTOCLIP_HARNESS_LLM_URL")),
72+
AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180, LlmApiUrl = llmEndpoint,
73+
6574
LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"),
75+
76+
LlmDefaultModel = defaultModel,
6677
LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT")
6778
?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt),
6879
MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5,
6980
VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".",
7081
DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? "en-US"
7182
});
72-
7383
var speechLogger = loggerFactory.CreateLogger<AzureSpeechTranscriptionService>();
7484
var llmLogger = loggerFactory.CreateLogger<ClipSegmentationService>();
7585
var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger);
7686
var llmService = new ClipSegmentationService(new HttpClient(), options, llmLogger);
77-
7887
var transcriptionRequest = new SpeechTranscriptionRequest
7988
{
8089
Language = language,
@@ -86,12 +95,10 @@
8695
Console.WriteLine($"[HARNESS] Transcribing {workingFile} ...");
8796
var segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None);
8897
Console.WriteLine($"[HARNESS] Received {segments.Count} transcript segments");
89-
9098
var fullTranscriptBody = BuildTranscriptDocument(segments);
9199
var fullTranscriptPath = Path.Combine(outputDir, "transcript_full.txt");
92100
await File.WriteAllTextAsync(fullTranscriptPath, fullTranscriptBody ?? string.Empty);
93101
Console.WriteLine($"[HARNESS] Full transcript -> {fullTranscriptPath}");
94-
95102
var segmentationSettings = BuildSegmentationSettings(stationProfile);
96103
Console.WriteLine("[HARNESS] Asking LLM for clip definitions ...");
97104
var promptDebugPath = Path.Combine(outputDir, "llm_prompt_debug.txt");
@@ -101,7 +108,6 @@
101108
.OrderBy(c => c.Start)
102109
.ToArray();
103110
Console.WriteLine($"[HARNESS] LLM returned {clipDefinitions.Length} clip candidates");
104-
105111
var index = 1;
106112
foreach (var definition in clipDefinitions)
107113
{
@@ -292,6 +298,19 @@ static async Task<string> CreateClipFileAsync(string srcFile, string outputDir,
292298

293299

294300

301+
302+
// Resolves the chat-completions endpoint the harness calls.
//   baseUrl    - either a full URL that already contains "/chat/completions" (used as-is)
//                or a base resource URL to which the deployment path is appended.
//   deployment - deployment name; required only when baseUrl is a base resource URL.
//   apiVersion - value for the "api-version" query parameter; blank falls back to the default.
// Throws InvalidOperationException when baseUrl is blank, or when a base resource URL is
// given without a deployment.
static Uri BuildLlmEndpointUri(string baseUrl, string? deployment, string apiVersion)
{
    // NOTE(review): this default looks like a model version rather than a service
    // api-version - confirm it is accepted by the target endpoint.
    const string defaultApiVersion = "2024-07-18";

    if (string.IsNullOrWhiteSpace(baseUrl))
        throw new InvalidOperationException("AUTOCLIP_HARNESS_LLM_URL must be set.");

    // Trim first so stray whitespace from the environment cannot hide a trailing '/'.
    var trimmedBaseUrl = baseUrl.Trim();
    if (trimmedBaseUrl.Contains("/chat/completions", StringComparison.OrdinalIgnoreCase))
        return new Uri(trimmedBaseUrl);

    if (string.IsNullOrWhiteSpace(deployment))
        throw new InvalidOperationException("AUTOCLIP_HARNESS_LLM_DEPLOYMENT must be set when using a base LLM URL.");

    // Escape both values so whitespace or reserved characters ('/', '?', '&', ...) coming
    // from the environment cannot alter the path or inject extra query parameters.
    var deploymentSegment = Uri.EscapeDataString(deployment.Trim());
    var version = Uri.EscapeDataString(string.IsNullOrWhiteSpace(apiVersion) ? defaultApiVersion : apiVersion.Trim());

    return new Uri($"{trimmedBaseUrl.TrimEnd('/')}/openai/deployments/{deploymentSegment}/chat/completions?api-version={version}");
}
313+
295314
static ClipSegmentationSettings BuildSegmentationSettings(StationProfile profile)
296315
{
297316
return new ClipSegmentationSettings
@@ -373,3 +392,4 @@ static IReadOnlyList<TimestampedTranscript> ExtractTranscriptRange(IReadOnlyList
373392

374393

375394

395+

0 commit comments

Comments
 (0)