Skip to content

Commit 38ce4f0

Browse files
committed
sub: add closest subtitle option using Levenshtein distance
This adds a new subtitle selection option that picks the subtitle file most similar to the media filename based on Levenshtein distance.
1 parent 454d9eb commit 38ce4f0

File tree

3 files changed

+127
-4
lines changed

3 files changed

+127
-4
lines changed

DOCS/man/options.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2826,16 +2826,19 @@ Subtitles
28262826
rendering of ASS/SSA subtitles. It can sometimes be useful to forcibly
28272827
override the styling of ASS subtitles, but should be avoided in general.
28282828

2829-
``--sub-auto=<no|exact|fuzzy|all>``
2829+
``--sub-auto=<no|exact|fuzzy|all|closest>``
28302830
Load additional subtitle files matching the video filename. The parameter
28312831
specifies how external subtitle files are matched. ``exact`` is enabled by
28322832
default.
28332833

28342834
:no: Don't automatically load external subtitle files.
28352835
:exact: Load the media filename with subtitle file extension and possibly
28362836
language suffixes (default).
2837-
:fuzzy: Load all subs containing the media filename.
2838-
:all: Load all subs in the current and ``--sub-file-paths`` directories.
2837+
:fuzzy: Load all subs containing the media filename.
2838+
:all: Load all subs in the current and ``--sub-file-paths`` directories.
2839+
:closest: Load exactly one external subtitle: the single file whose filename
2840+
is most similar to the media's base name. Language/flag suffixes
2841+
like ``.en``, ``.eng``, ``.forced`` (and any other trailing 2-3 letter word) are ignored for similarity.
28392842

28402843
``--sub-auto-exts=ext1,ext2,...``
28412844
Subtitle extensions to try and match when using ``--sub-auto``. Note that

options/options.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -722,7 +722,7 @@ static const m_option_t mp_opts[] = {
722722
{"autoload-files", OPT_BOOL(autoload_files)},
723723

724724
{"sub-auto", OPT_CHOICE(sub_auto,
725-
{"no", -1}, {"exact", 0}, {"fuzzy", 1}, {"all", 2})},
725+
{"no", -1}, {"exact", 0}, {"fuzzy", 1}, {"all", 2}, {"closest", 3})},
726726
{"sub-auto-exts", OPT_STRINGLIST(sub_auto_exts), .flags = UPDATE_SUB_EXTS},
727727
{"audio-file-auto", OPT_CHOICE(audiofile_auto,
728728
{"no", -1}, {"exact", 0}, {"fuzzy", 1}, {"all", 2})},

player/loadfile.c

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
#include <inttypes.h>
2121
#include <assert.h>
2222
#include <time.h>
23+
#include <ctype.h>
24+
#include <string.h>
25+
26+
// Helpers for sub-auto=closest are defined further down, next to mp_add_external_file().
2327

2428
#include <libavutil/avutil.h>
2529

@@ -47,6 +51,7 @@
4751
#include "input/input.h"
4852
#include "misc/json.h"
4953
#include "misc/language.h"
54+
#include "misc/bstr.h"
5055

5156
#include "audio/out/ao.h"
5257
#include "filters/f_decoder_wrapper.h"
@@ -938,6 +943,86 @@ int mp_add_external_file(struct MPContext *mpctx, char *filename,
938943
return -1;
939944
}
940945

946+
// Helpers for sub-auto=closest selection
947+
// Decide whether a filename token looks like a trailing language/flag marker
// (e.g. "en", "eng", "forced") rather than part of the title.
//
// tkn is expected to be a lowercase, NUL-terminated token; the comparison
// against the table below is case-sensitive. A NULL token is not a suffix.
//
// Returns true if the token is 2 or 3 characters long and consists only of
// alphabetic characters, or if it exactly matches one of the listed markers.
// Note that the length rule is a heuristic: short title words such as "the"
// or "man" are also reported as suffix tokens.
static bool is_suffix_token(const char *tkn)
{
    // Most entries are already covered by the 2-3 letter rule; they are kept
    // so the table documents the markers this is meant to catch.
    static const char *const special[] = {
        "eng", "en", "es", "fr", "de", "pt", "ru", "jp", "ja", "zh",
        "chs", "cht", "sub", "subs", "sdh", "forced", "cc", NULL
    };

    if (!tkn)
        return false;

    size_t len = strlen(tkn);
    if (len == 2 || len == 3) {
        bool alpha = true;
        // Cast to unsigned char: passing a negative char to isalpha() is UB.
        for (size_t i = 0; i < len && alpha; i++)
            alpha = isalpha((unsigned char)tkn[i]) != 0;
        if (alpha)
            return true;
    }

    for (int i = 0; special[i]; i++) {
        if (strcmp(tkn, special[i]) == 0)
            return true;
    }
    return false;
}
961+
962+
static char *normalize_base_name(void *ta_ctx, const char *path)
963+
{
964+
struct bstr base = bstr0(mp_basename(path));
965+
base = bstr_strip_ext(base);
966+
char *tmpbuf = talloc_strndup(ta_ctx, base.start, base.len);
967+
for (int i = 0; tmpbuf[i]; i++)
968+
tmpbuf[i] = tolower((unsigned char)tmpbuf[i]);
969+
char **tokens = NULL;
970+
int ntok = 0;
971+
char *p = tmpbuf;
972+
while (*p) {
973+
while (*p && !isalnum((unsigned char)*p)) p++;
974+
if (!*p) break;
975+
char *start = p;
976+
while (*p && isalnum((unsigned char)*p)) p++;
977+
char save = *p; *p = '\0';
978+
MP_TARRAY_APPEND(ta_ctx, tokens, ntok, talloc_strdup(ta_ctx, start));
979+
*p = save;
980+
}
981+
while (ntok > 0 && is_suffix_token(tokens[ntok - 1]))
982+
ntok--;
983+
char *out = talloc_strdup(ta_ctx, "");
984+
for (int i = 0; i < ntok; i++)
985+
out = talloc_asprintf_append_buffer(out, "%s", tokens[i]);
986+
if (!out[0])
987+
out = talloc_strdup(ta_ctx, tmpbuf);
988+
return out;
989+
}
990+
991+
static int levenshtein_dist(const char *a, const char *b)
992+
{
993+
int la = (int)strlen(a), lb = (int)strlen(b);
994+
if (la == 0) return lb;
995+
if (lb == 0) return la;
996+
int *prev = talloc_array(NULL, int, lb + 1);
997+
int *curr = talloc_array(NULL, int, lb + 1);
998+
for (int j = 0; j <= lb; j++) prev[j] = j;
999+
for (int i = 1; i <= la; i++) {
1000+
curr[0] = i;
1001+
for (int j = 1; j <= lb; j++) {
1002+
int cost = a[i - 1] == b[j - 1] ? 0 : 1;
1003+
int del = prev[j] + 1;
1004+
int ins = curr[j - 1] + 1;
1005+
int sub = prev[j - 1] + cost;
1006+
int m = del < ins ? del : ins;
1007+
curr[j] = m < sub ? m : sub;
1008+
}
1009+
int *tmpv = prev; prev = curr; curr = tmpv;
1010+
}
1011+
int d = prev[lb];
1012+
talloc_free(prev);
1013+
talloc_free(curr);
1014+
return d;
1015+
}
1016+
1017+
// Map the edit distance between a and b onto a similarity score:
// 1.0 minus the Levenshtein distance divided by the length of the longer
// string. Two empty strings are considered identical (1.0).
static double similarity_ratio(const char *a, const char *b)
{
    int len_a = (int)strlen(a);
    int len_b = (int)strlen(b);
    int longest = len_b < len_a ? len_a : len_b;

    // Avoid dividing by zero when both strings are empty.
    if (!longest)
        return 1.0;

    int distance = levenshtein_dist(a, b);
    return 1.0 - (double)distance / (double)longest;
}
1025+
9411026
// to be run on a worker thread, locked (temporarily unlocks core)
9421027
static void open_external_files(struct MPContext *mpctx, char **files,
9431028
enum stream_type filter)
@@ -974,6 +1059,38 @@ void autoload_external_files(struct MPContext *mpctx, struct mp_cancel *cancel)
9741059
sc[mpctx->tracks[n]->type]++;
9751060
}
9761061

1062+
// Preselect single best subtitle if sub-auto=closest
1063+
int best_sub_index = -1;
1064+
if (opts->sub_auto == 3) {
1065+
void *selctx = talloc_new(tmp);
1066+
char *movie_norm = normalize_base_name(selctx, mpctx->filename);
1067+
double best_score = -1.0;
1068+
for (int i = 0; list && list[i].fname; i++) {
1069+
struct subfn *e = &list[i];
1070+
if (e->type != STREAM_SUB)
1071+
continue;
1072+
// apply the same basic eligibility checks as the loading loop
1073+
bool already_loaded = false;
1074+
for (int n = 0; n < mpctx->num_tracks; n++) {
1075+
struct track *t = mpctx->tracks[n];
1076+
if (t->demuxer && strcmp(t->demuxer->filename, e->fname) == 0) {
1077+
already_loaded = true; break;
1078+
}
1079+
}
1080+
if (already_loaded)
1081+
continue;
1082+
if (!sc[STREAM_VIDEO] && !sc[STREAM_AUDIO])
1083+
continue;
1084+
char *cand_norm = normalize_base_name(selctx, e->fname);
1085+
double score = similarity_ratio(movie_norm, cand_norm);
1086+
if (score > best_score) {
1087+
best_score = score;
1088+
best_sub_index = i;
1089+
}
1090+
}
1091+
talloc_free(selctx);
1092+
}
1093+
9771094
for (int i = 0; list && list[i].fname; i++) {
9781095
struct subfn *e = &list[i];
9791096

@@ -989,6 +1106,9 @@ void autoload_external_files(struct MPContext *mpctx, struct mp_cancel *cancel)
9891106
if (e->type == STREAM_VIDEO && (sc[STREAM_VIDEO] || !sc[STREAM_AUDIO]))
9901107
goto skip;
9911108

1109+
if (opts->sub_auto == 3 && e->type == STREAM_SUB && i != best_sub_index)
1110+
goto skip;
1111+
9921112
enum track_flags flags = e->flags;
9931113
// when given filter is set to video, we are loading up cover art
9941114
flags |= e->type == STREAM_VIDEO ? TRACK_ATTACHED_PICTURE : 0;

0 commit comments

Comments
 (0)