Skip to content

Commit ae8e5a1

Browse files
authored
Merge pull request #85 from Maxr1998/fix-lrc-parser
Fix white space handling while parsing LRC with word time tags
2 parents 167a45e + 0c303d8 commit ae8e5a1

File tree

4 files changed

+136
-35
lines changed

4 files changed

+136
-35
lines changed

LrcParser.Tests/Parser/Lrc/Lines/LrcLyricParserTest.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ public void TestDecode(string lyric, LrcLyric expected)
5050
{
5151
Text = "帰り道は",
5252
StartTimes = [17000],
53-
TimeTags = TestCaseTagHelper.ParseTimeTags(["[1,start]:1000", "[2,start]:2000", "[3,start]:3000", "[3,end]:4000"]),
53+
// [0,start]:17000 is created from the line time tag
54+
TimeTags = TestCaseTagHelper.ParseTimeTags(["[0,start]:17000", "[1,start]:1000", "[2,start]:2000", "[3,start]:3000", "[3,end]:4000"]),
5455
},
5556
],
5657
[

LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,72 @@ public class LrcTimedTextUtilsTest
1313
#region Decode
1414

1515
[TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })]
16-
[TestCase(" <00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", " 帰り道は", new[] { "[1,start]:17970", "[2,start]:18370", "[3,start]:18550", "[4,start]:18940", "[4,end]:19220" })]
17-
[TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22> ", "帰り道は ", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })]
18-
[TestCase("帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })]
16+
[TestCase(" <00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })]
17+
[TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22> ", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })]
18+
[TestCase("帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[0,start]:0", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })]
1919
[TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940" })]
2020
[TestCase("帰り道は", "帰り道は", new string[] { })]
2121
[TestCase("", "", new string[] { })]
22+
[TestCase(" ", "", new string[] { })]
2223
[TestCase(null, "", new string[] { })]
24+
[TestCase("<00:51.00> <01:29.99><01:48.29> <02:31.00> <02:41.99>You gotta fight !", "You gotta fight !", new[] { "[0,start]:161990" })] // multiple empty tags
25+
// Surrounding time tags
26+
[TestCase(
27+
"<00:06.84> Every <00:07.20> <00:07.56> night <00:07.87> <00:08.19> that <00:08.46> <00:08.79> goes <00:09.19> <00:09.59> between",
28+
"Every night that goes between",
29+
new[] { "[0,start]:6840", "[4,end]:7200", "[6,start]:7560", "[10,end]:7870", "[12,start]:8190", "[15,end]:8460", "[17,start]:8790", "[20,end]:9190", "[22,start]:9590" }
30+
)]
31+
// Alternating time tags, spaced on both sides
32+
[TestCase(
33+
"<00:06.84> Every <00:07.56> night <00:08.19> that <00:08.79> goes <00:09.59> between", "Every night that goes between",
34+
new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" }
35+
)]
36+
// Alternating time tags, unspaced
37+
[TestCase(
38+
"<00:06.84>Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", "Everynightthatgoesbetween",
39+
new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" }
40+
)]
41+
[TestCase(
42+
"Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", "Everynightthatgoesbetween",
43+
new[] { "[0,start]:0", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" }
44+
)]
45+
// Alternating time tags, prefix spaced
46+
[TestCase(
47+
"<00:06.84> Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", "Every night that goes between",
48+
new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" }
49+
)]
50+
[TestCase(
51+
"Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", "Every night that goes between",
52+
new[] { "[0,start]:0", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" }
53+
)]
54+
// Alternating time tags, postfix spaced
55+
[TestCase(
56+
"<00:06.84>Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", "Every night that goes between",
57+
new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" }
58+
)]
59+
[TestCase(
60+
"Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", "Every night that goes between",
61+
new[] { "[0,start]:0", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" }
62+
)]
2363
public void TestDecode(string text, string expectedText, string[] expectedTimeTags)
2464
{
25-
var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text);
65+
var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text, 0);
2666

2767
Assert.That(actualText, Is.EqualTo(expectedText));
2868
Assert.That(actualTimeTags, Is.EqualTo(TestCaseTagHelper.ParseTimeTags(expectedTimeTags)));
2969
}
3070

31-
[TestCase("<00:51.00><01:29.99><01:48.29><02:31.00><02:41.99>You gotta fight !", "You gotta fight !", new[] { "[0,start]:51000" })] // decode with invalid format.
32-
public void TestDecodeWithInvalidFormat(string text, string expectedText, string[] expectedTimeTags)
71+
[TestCase(
72+
"<00:06.84>Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween",
73+
new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" }
74+
)]
75+
[TestCase(
76+
"Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween",
77+
new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" }
78+
)]
79+
public void TestDecodeWithStartTime(string text, int lineStartTime, string expectedText, string[] expectedTimeTags)
3380
{
34-
var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text);
81+
var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text, lineStartTime);
3582

3683
Assert.That(actualText, Is.EqualTo(expectedText));
3784
Assert.That(actualTimeTags, Is.EqualTo(TestCaseTagHelper.ParseTimeTags(expectedTimeTags)));

LrcParser/Parser/Lrc/Lines/LrcLyricParser.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public override LrcLyric Decode(string text)
3636
};
3737
}
3838

39-
var (lyric, timeTags) = LrcTimedTextUtils.TimedTextToObject(rawLyric);
39+
var (lyric, timeTags) = LrcTimedTextUtils.TimedTextToObject(rawLyric, startTimes[0]);
4040

4141
return new LrcLyric
4242
{

LrcParser/Parser/Lrc/Utils/LrcTimedTextUtils.cs

Lines changed: 79 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) karaoke.dev <contact@karaoke.dev>. Licensed under the MIT Licence.
22
// See the LICENCE file in the repository root for full licence text.
33

4+
using System.Text;
45
using LrcParser.Model;
56
using LrcParser.Utils;
67
using static LrcParser.Parser.Lrc.Utils.TimeTagMode;
@@ -10,53 +11,105 @@ namespace LrcParser.Parser.Lrc.Utils;
1011
internal static class LrcTimedTextUtils
1112
{
1213
/// <summary>
13-
///
14+
/// Parses the passed text for word time tags.
1415
/// </summary>
1516
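/// <param name="timedText">Lyric text of a single line, which may contain word time tags. Null, empty or whitespace-only input yields an empty result.</param>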
17+
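/// <param name="lineStartTime">Start time of the line in milliseconds; used as the start time tag of the first lyric segment when no word time tag precedes it.</param>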
1618
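/// <returns>A tuple of the lyric text with the word time tags removed and the parsed time tags keyed by their text index.</returns>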
17-
internal static Tuple<string, SortedDictionary<TextIndex, int>> TimedTextToObject(string timedText)
19+
internal static Tuple<string, SortedDictionary<TextIndex, int>> TimedTextToObject(string timedText, int lineStartTime)
1820
{
19-
if (string.IsNullOrEmpty(timedText))
21+
if (string.IsNullOrWhiteSpace(timedText))
22+
{
2023
return new Tuple<string, SortedDictionary<TextIndex, int>>("", new SortedDictionary<TextIndex, int>());
24+
}
2125

22-
var matchTimeTags = TimeTagUtils.WORD_TIME_TAG_REGEX.Matches(timedText);
26+
var textLength = timedText.Length;
27+
var lyricText = new StringBuilder();
28+
var timeTags = new SortedDictionary<TextIndex, int>();
2329

24-
var endTextIndex = timedText.Length;
30+
var timeTagMatches = TimeTagUtils.WORD_TIME_TAG_REGEX.Matches(timedText);
2531

26-
var startIndex = 0;
32+
if (timeTagMatches.Count == 0)
33+
{
34+
// no word time tags, return lyric as-is
35+
return new Tuple<string, SortedDictionary<TextIndex, int>>(timedText, new SortedDictionary<TextIndex, int>());
36+
}
2737

28-
var text = string.Empty;
29-
var timeTags = new SortedDictionary<TextIndex, int>();
38+
var lastTimeTag = lineStartTime;
39+
var segmentStartIndex = 0;
40+
var insertSpace = false;
41+
var lastTagWasStartTag = false;
3042

31-
foreach (var match in matchTimeTags.ToArray())
43+
foreach (var match in timeTagMatches.ToArray())
3244
{
33-
var endIndex = match.Index;
45+
// Segment ends at the start of the next time tag
46+
var segmentEndIndex = match.Index;
47+
48+
var segment = timedText[segmentStartIndex..segmentEndIndex];
49+
50+
// Update next start index
51+
segmentStartIndex = segmentEndIndex + match.Length;
3452

35-
if (startIndex < endIndex)
53+
if (string.IsNullOrWhiteSpace(segment))
3654
{
37-
// add the text.
38-
text += timedText[startIndex..endIndex];
55+
// The previous lyric segment received a start tag and this segment is empty, so the tag between them (lastTimeTag) marks the end of that lyric: insert an end tag
56+
if (lastTagWasStartTag)
57+
{
58+
timeTags.TryAdd(new TextIndex(lyricText.Length - 1, IndexState.End), lastTimeTag);
59+
lastTagWasStartTag = false;
60+
}
61+
62+
// Skip empty lyric, update start time
63+
lastTimeTag = TimeTagUtils.ConvertTimeTagToMilliseconds(match.Value, WordTimeTag);
64+
65+
// Segment contains only whitespace but isn't empty, insert a space before an upcoming valid segment.
66+
if (segment.Length > 0) insertSpace = true;
67+
continue;
3968
}
4069

41-
// update the new start for next time-tag calculation.
42-
startIndex = endIndex + match.Length;
70+
// If the last segment ended with whitespace, or the current starts with whitespace,
71+
// insert a single space before the next segment.
72+
if ((char.IsWhiteSpace(segment[0]) || insertSpace) && lyricText.Length > 0)
73+
{
74+
lyricText.Append(' ');
75+
}
4376

44-
// add the time-tag.
45-
var hasText = startIndex < endTextIndex;
46-
var isEmptyStringNext = hasText && timedText[startIndex] == ' ';
77+
// Add start time tag for next lyric
78+
timeTags.TryAdd(new TextIndex(lyricText.Length), lastTimeTag);
79+
lastTagWasStartTag = true;
4780

48-
var state = hasText && !isEmptyStringNext ? IndexState.Start : IndexState.End;
49-
var textIndex = text.Length - (state == IndexState.Start ? 0 : 1);
50-
var time = TimeTagUtils.ConvertTimeTagToMilliseconds(match.Value, WordTimeTag);
81+
// Append lyric segment without surrounding whitespace
82+
lyricText.Append(segment.Trim());
5183

52-
// using try add because it might be possible with duplicated time-tag position in the lyric.
53-
timeTags.TryAdd(new TextIndex(textIndex, state), time);
84+
// Update start time for the next segment
85+
lastTimeTag = TimeTagUtils.ConvertTimeTagToMilliseconds(match.Value, WordTimeTag);
86+
87+
// Reset insertSpace flag after adding a segment,
88+
// and instead track whether this new segment ends with whitespace
89+
insertSpace = char.IsWhiteSpace(segment[^1]);
5490
}
5591

56-
// should add remaining text at the right of the end time-tag.
57-
text += timedText[startIndex..endTextIndex];
92+
var remaining = timedText[segmentStartIndex..textLength];
93+
94+
if (!string.IsNullOrWhiteSpace(remaining))
95+
{
96+
if ((char.IsWhiteSpace(remaining[0]) || insertSpace) && lyricText.Length > 0)
97+
{
98+
// Add space before the next segment
99+
lyricText.Append(' ');
100+
}
101+
102+
// Add remaining text with start time tag
103+
timeTags.TryAdd(new TextIndex(lyricText.Length), lastTimeTag);
104+
lyricText.Append(remaining.Trim());
105+
}
106+
else
107+
{
108+
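// No remaining text, so the final time tag is the end tag of the last lyric segment. NOTE(review): for a line containing only time tags lyricText is empty and this index becomes -1 - confirm that is intended.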
109+
timeTags.TryAdd(new TextIndex(lyricText.Length - 1, IndexState.End), lastTimeTag);
110+
}
58111

59-
return new Tuple<string, SortedDictionary<TextIndex, int>>(text, timeTags);
112+
return new Tuple<string, SortedDictionary<TextIndex, int>>(lyricText.ToString(), timeTags);
60113
}
61114

62115
internal static string ToTimedText(string text, SortedDictionary<TextIndex, int> timeTags)

0 commit comments

Comments
 (0)