Merged
Changes from all commits
27 commits
8c22b6c  Add response filtering (tcely, Jan 7, 2025)
63fa97c  More compact JSON (tcely, Jan 7, 2025)
8c31720  Log the reduction of metadata length (tcely, Jan 7, 2025)
ca75398  Merge branch 'meeb:main' into filter-metadata-response (tcely, Jan 7, 2025)
25d2ff6  Don't reduce the actual data yet (tcely, Jan 7, 2025)
2f34fff  Fixes from testing (tcely, Jan 7, 2025)
9a4101a  Fix formatting (tcely, Jan 7, 2025)
db25fa8  Adjusted comment (tcely, Jan 7, 2025)
431de2e  Loop over a set of keys for each URL type (tcely, Jan 7, 2025)
7b8d117  Drop keys from formats that cannot be useful (tcely, Jan 7, 2025)
c7457e9  Check that the drop_key exists (tcely, Jan 7, 2025)
2d85bcb  Use a distinct try to log errors (tcely, Jan 7, 2025)
8ac5b36  Use the exception function for traceback (tcely, Jan 7, 2025)
7793701  Simplify results from _url_keys (tcely, Jan 7, 2025)
1c432cc  Some formats are using a different URL (tcely, Jan 7, 2025)
d35f52f  Drop /expire/ URLs from automatic_captions too (tcely, Jan 8, 2025)
ad10bcf  Log both compacted and reduced sizes (tcely, Jan 9, 2025)
100382f  Rename compact_data to compact_json (tcely, Jan 9, 2025)
682a53d  Add a filter_response test (tcely, Jan 9, 2025)
4c9fa40  More filter_response asserts (tcely, Jan 9, 2025)
3e3f80d  More filter_response asserts (tcely, Jan 9, 2025)
29c39aa  Add SHRINK_NEW_MEDIA_METADATA setting (tcely, Jan 9, 2025)
0f98694  Have filter_response return a copy, if requested (tcely, Jan 9, 2025)
274f19f  Use the new copy argument to filter_response (tcely, Jan 9, 2025)
1ff8dfd  Use the new copy argument to filter_response (tcely, Jan 9, 2025)
6292a9a  Add SHRINK_OLD_MEDIA_METADATA setting (tcely, Jan 9, 2025)
45d7039  Only log the extra messages with the new setting (tcely, Jan 14, 2025)
33 changes: 32 additions & 1 deletion tubesync/sync/models.py
@@ -19,7 +19,7 @@
from .youtube import (get_media_info as get_youtube_media_info,
download_media as download_youtube_media,
get_channel_image_info as get_youtube_channel_image_info)
from .utils import seconds_to_timestr, parse_media_format
from .utils import seconds_to_timestr, parse_media_format, filter_response
from .matching import (get_best_combined_format, get_best_audio_format,
get_best_video_format)
from .mediaservers import PlexMediaServer
@@ -1143,8 +1143,39 @@ def format_dict(self):
def has_metadata(self):
return self.metadata is not None


@property
def reduce_data(self):
try:
from common.logger import log
from common.utils import json_serial

old_mdl = len(self.metadata or "")
data = json.loads(self.metadata or "")
compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)

filtered_data = filter_response(data, True)
filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
except Exception as e:
log.exception('reduce_data: %s', e)
else:
# log the results of filtering / compacting on metadata size
new_mdl = len(compact_json)
if old_mdl > new_mdl:
delta = old_mdl - new_mdl
log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
new_mdl = len(filtered_json)
if old_mdl > new_mdl:
delta = old_mdl - new_mdl
log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
self.metadata = filtered_json


@property
def loaded_metadata(self):
if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
self.reduce_data
try:
data = json.loads(self.metadata)
if not isinstance(data, dict):
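
For context on the compaction step above: reduce_data relies on json.dumps with explicit separators to strip the spaces the default serializer inserts after ',' and ':'. A minimal standalone sketch of that size difference (the sample dictionary is made up for illustration and is not TubeSync data):

    import json

    data = {'id': 'abc123', 'formats': [{'format_id': '251', 'acodec': 'opus'}]}

    default_json = json.dumps(data)
    compact_json = json.dumps(data, separators=(',', ':'))

    # The compact form drops one space per key and per item separator,
    # which adds up quickly for large yt-dlp metadata blobs.
    print(len(default_json), len(compact_json))
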
7 changes: 5 additions & 2 deletions tubesync/sync/tasks.py
@@ -26,7 +26,7 @@
from common.utils import json_serial
from .models import Source, Media, MediaServer
from .utils import (get_remote_image, resize_image_to_height, delete_file,
write_text_file)
write_text_file, filter_response)
from .filtering import filter_media


@@ -304,7 +304,10 @@ def download_media_metadata(media_id):
return
source = media.source
metadata = media.index_metadata()
media.metadata = json.dumps(metadata, default=json_serial)
response = metadata
if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
response = filter_response(metadata, True)
media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
upload_date = media.upload_date
# Media must have a valid upload date
if upload_date:
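
The new behaviour in download_media_metadata is gated by SHRINK_NEW_MEDIA_METADATA, and the reduce_data path in models.py by SHRINK_OLD_MEDIA_METADATA; both are read with getattr(settings, ..., False), so they default to off. A hedged sketch of enabling them in the Django settings module (whether TubeSync also maps these to environment variables is not shown in this diff):

    # settings.py (local override); both flags default to False when absent
    SHRINK_NEW_MEDIA_METADATA = True   # filter metadata before it is first stored
    SHRINK_OLD_MEDIA_METADATA = True   # filter already-stored metadata when it is loaded
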
79 changes: 79 additions & 0 deletions tubesync/sync/tests.py
@@ -18,6 +18,7 @@
from .models import Source, Media
from .tasks import cleanup_old_media
from .filtering import filter_media
from .utils import filter_response


class FrontEndTestCase(TestCase):
@@ -1709,6 +1710,84 @@ def test_is_regex_match(self):
f'expected {expected_match_result}')


class ResponseFilteringTestCase(TestCase):

def setUp(self):
# Disable general logging for test case
logging.disable(logging.CRITICAL)
# Add a test source
self.source = Source.objects.create(
source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
key='testkey',
name='testname',
directory='testdirectory',
index_schedule=3600,
delete_old_media=False,
days_to_keep=14,
source_resolution=Source.SOURCE_RESOLUTION_1080P,
source_vcodec=Source.SOURCE_VCODEC_VP9,
source_acodec=Source.SOURCE_ACODEC_OPUS,
prefer_60fps=False,
prefer_hdr=False,
fallback=Source.FALLBACK_FAIL
)
# Add some media
self.media = Media.objects.create(
key='mediakey',
source=self.source,
metadata='{}'
)

def test_metadata_20230629(self):
self.media.metadata = all_test_metadata['20230629']
self.media.save()

unfiltered = self.media.loaded_metadata
filtered = filter_response(self.media.loaded_metadata)
self.assertIn('formats', unfiltered.keys())
self.assertIn('formats', filtered.keys())
# filtered 'downloader_options'
self.assertIn('downloader_options', unfiltered['formats'][10].keys())
self.assertNotIn('downloader_options', filtered['formats'][10].keys())
# filtered 'http_headers'
self.assertIn('http_headers', unfiltered['formats'][0].keys())
self.assertNotIn('http_headers', filtered['formats'][0].keys())
# did not lose any formats
self.assertEqual(48, len(unfiltered['formats']))
self.assertEqual(48, len(filtered['formats']))
self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
# did not remove everything with url
self.assertIn('original_url', unfiltered.keys())
self.assertIn('original_url', filtered.keys())
self.assertEqual(unfiltered['original_url'], filtered['original_url'])
# did reduce the size of the metadata
self.assertTrue(len(str(filtered)) < len(str(unfiltered)))

url_keys = []
for format in unfiltered['formats']:
for key in format.keys():
if 'url' in key:
url_keys.append((format['format_id'], key, format[key],))
unfiltered_url_keys = url_keys
self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))

url_keys = []
for format in filtered['formats']:
for key in format.keys():
if 'url' in key:
url_keys.append((format['format_id'], key, format[key],))
filtered_url_keys = url_keys
self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))

url_keys = []
for lang_code, captions in filtered['automatic_captions'].items():
for caption in captions:
for key in caption.keys():
if 'url' in key:
url_keys.append((lang_code, caption['ext'], caption[key],))
self.assertEqual(0, len(url_keys), msg=str(url_keys))


class TasksTestCase(TestCase):

def setUp(self):
90 changes: 90 additions & 0 deletions tubesync/sync/utils.py
@@ -1,6 +1,7 @@
import os
import re
import math
from copy import deepcopy
from operator import itemgetter
from pathlib import Path
from tempfile import NamedTemporaryFile
@@ -171,6 +172,95 @@ def normalize_codec(codec_str):
return result


def _url_keys(arg_dict, filter_func):
result = {}
for key in arg_dict.keys():
if 'url' in key:
result.update(
{key: filter_func(key=key, url=arg_dict[key])}
)
return result


def _drop_url_keys(arg_dict, key, filter_func):
if key in arg_dict.keys():
for val_dict in arg_dict[key]:
for url_key, remove in _url_keys(val_dict, filter_func).items():
if remove is True:
del val_dict[url_key]


def filter_response(arg_dict, copy_arg=False):
'''
Clean up the response so as to not store useless metadata in the database.
'''
response_dict = arg_dict
# raise an exception for an unexpected argument type
if not isinstance(response_dict, dict):
raise TypeError(f'response_dict must be a dict, got "{type(response_dict)}"')

if copy_arg:
response_dict = deepcopy(arg_dict)

# optimize the empty case
if not response_dict:
return response_dict

# beginning of formats cleanup {{{
# drop urls that expire, or restrict IPs
def drop_format_url(**kwargs):
url = kwargs['url']
return (
url
and '://' in url
and (
'/ip/' in url
or 'ip=' in url
or '/expire/' in url
or 'expire=' in url
)
)

# these format keys are not useful to us
drop_keys = frozenset((
'downloader_options',
'fragments',
'http_headers',
'__needs_testing',
'__working',
))
for key in frozenset(('formats', 'requested_formats',)):
_drop_url_keys(response_dict, key, drop_format_url)
if key in response_dict.keys():
for format in response_dict[key]:
for drop_key in drop_keys:
if drop_key in format.keys():
del format[drop_key]
# end of formats cleanup }}}

# beginning of subtitles cleanup {{{
# drop urls that expire
def drop_subtitles_url(**kwargs):
url = kwargs['url']
return (
url
and '://' in url
and (
'/expire/' in url
or '&expire=' in url
)
)

for key in frozenset(('subtitles', 'automatic_captions',)):
if key in response_dict.keys():
key_dict = response_dict[key]
for lang_code in key_dict:
_drop_url_keys(key_dict, lang_code, drop_subtitles_url)
# end of subtitles cleanup }}}

return response_dict


def parse_media_format(format_dict):
'''
This parser primarily adapts the format dict returned by youtube-dl into a
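
As a usage sketch of filter_response (the sample data is illustrative; inside the project the function is imported as from .utils import filter_response): passing copy_arg=True returns a filtered deep copy and leaves the argument untouched, while expiring format URLs and the keys in drop_keys are removed.

    info = {
        'id': 'abc123',
        'formats': [{
            'format_id': '251',
            'url': 'https://example.invalid/videoplayback?expire=1736400000',
            'http_headers': {'User-Agent': 'yt-dlp'},
        }],
    }

    # copy_arg=True filters a deep copy instead of mutating `info` in place
    filtered = filter_response(info, copy_arg=True)

    assert 'http_headers' not in filtered['formats'][0]  # dropped: listed in drop_keys
    assert 'url' not in filtered['formats'][0]            # dropped: URL contains 'expire='
    assert 'url' in info['formats'][0]                    # original dict unchanged
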