Skip to content

Commit 51153f0

Browse files
authored
Merge pull request #612 from tcely/filter-metadata-response
Filter metadata to avoid storing excess text in the database table
2 parents d1a7e90 + 45d7039 commit 51153f0

File tree

4 files changed

+206
-3
lines changed

4 files changed

+206
-3
lines changed

tubesync/sync/models.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from .youtube import (get_media_info as get_youtube_media_info,
2020
download_media as download_youtube_media,
2121
get_channel_image_info as get_youtube_channel_image_info)
22-
from .utils import seconds_to_timestr, parse_media_format
22+
from .utils import seconds_to_timestr, parse_media_format, filter_response
2323
from .matching import (get_best_combined_format, get_best_audio_format,
2424
get_best_video_format)
2525
from .mediaservers import PlexMediaServer
@@ -1145,8 +1145,39 @@ def format_dict(self):
11451145
def has_metadata(self):
11461146
return self.metadata is not None
11471147

1148+
1149+
@property
1150+
def reduce_data(self):
1151+
try:
1152+
from common.logger import log
1153+
from common.utils import json_serial
1154+
1155+
old_mdl = len(self.metadata or "")
1156+
data = json.loads(self.metadata or "")
1157+
compact_json = json.dumps(data, separators=(',', ':'), default=json_serial)
1158+
1159+
filtered_data = filter_response(data, True)
1160+
filtered_json = json.dumps(filtered_data, separators=(',', ':'), default=json_serial)
1161+
except Exception as e:
1162+
log.exception('reduce_data: %s', e)
1163+
else:
1164+
# log the results of filtering / compacting on metadata size
1165+
new_mdl = len(compact_json)
1166+
if old_mdl > new_mdl:
1167+
delta = old_mdl - new_mdl
1168+
log.info(f'{self.key}: metadata compacted by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
1169+
new_mdl = len(filtered_json)
1170+
if old_mdl > new_mdl:
1171+
delta = old_mdl - new_mdl
1172+
log.info(f'{self.key}: metadata reduced by {delta:,} characters ({old_mdl:,} -> {new_mdl:,})')
1173+
if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
1174+
self.metadata = filtered_json
1175+
1176+
11481177
@property
11491178
def loaded_metadata(self):
1179+
if getattr(settings, 'SHRINK_OLD_MEDIA_METADATA', False):
1180+
self.reduce_data
11501181
try:
11511182
data = json.loads(self.metadata)
11521183
if not isinstance(data, dict):

tubesync/sync/tasks.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from common.utils import json_serial
2727
from .models import Source, Media, MediaServer
2828
from .utils import (get_remote_image, resize_image_to_height, delete_file,
29-
write_text_file)
29+
write_text_file, filter_response)
3030
from .filtering import filter_media
3131

3232

@@ -304,7 +304,10 @@ def download_media_metadata(media_id):
304304
return
305305
source = media.source
306306
metadata = media.index_metadata()
307-
media.metadata = json.dumps(metadata, default=json_serial)
307+
response = metadata
308+
if getattr(settings, 'SHRINK_NEW_MEDIA_METADATA', False):
309+
response = filter_response(metadata, True)
310+
media.metadata = json.dumps(response, separators=(',', ':'), default=json_serial)
308311
upload_date = media.upload_date
309312
# Media must have a valid upload date
310313
if upload_date:

tubesync/sync/tests.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from .models import Source, Media
1919
from .tasks import cleanup_old_media
2020
from .filtering import filter_media
21+
from .utils import filter_response
2122

2223

2324
class FrontEndTestCase(TestCase):
@@ -1709,6 +1710,84 @@ def test_is_regex_match(self):
17091710
f'expected {expected_match_result}')
17101711

17111712

1713+
class ResponseFilteringTestCase(TestCase):
    '''Exercises filter_response() against captured yt-dlp metadata.'''

    def setUp(self):
        # Disable general logging for test case
        logging.disable(logging.CRITICAL)
        # Add a test source
        self.source = Source.objects.create(
            source_type=Source.SOURCE_TYPE_YOUTUBE_CHANNEL,
            key='testkey',
            name='testname',
            directory='testdirectory',
            index_schedule=3600,
            delete_old_media=False,
            days_to_keep=14,
            source_resolution=Source.SOURCE_RESOLUTION_1080P,
            source_vcodec=Source.SOURCE_VCODEC_VP9,
            source_acodec=Source.SOURCE_ACODEC_OPUS,
            prefer_60fps=False,
            prefer_hdr=False,
            fallback=Source.FALLBACK_FAIL
        )
        # Add some media
        self.media = Media.objects.create(
            key='mediakey',
            source=self.source,
            metadata='{}'
        )

    def test_metadata_20230629(self):
        self.media.metadata = all_test_metadata['20230629']
        self.media.save()

        unfiltered = self.media.loaded_metadata
        filtered = filter_response(self.media.loaded_metadata)
        self.assertIn('formats', unfiltered.keys())
        self.assertIn('formats', filtered.keys())
        # filtered 'downloader_options'
        self.assertIn('downloader_options', unfiltered['formats'][10].keys())
        self.assertNotIn('downloader_options', filtered['formats'][10].keys())
        # filtered 'http_headers'
        self.assertIn('http_headers', unfiltered['formats'][0].keys())
        self.assertNotIn('http_headers', filtered['formats'][0].keys())
        # did not lose any formats
        self.assertEqual(48, len(unfiltered['formats']))
        self.assertEqual(48, len(filtered['formats']))
        self.assertEqual(len(unfiltered['formats']), len(filtered['formats']))
        # did not remove everything with url
        self.assertIn('original_url', unfiltered.keys())
        self.assertIn('original_url', filtered.keys())
        self.assertEqual(unfiltered['original_url'], filtered['original_url'])
        # did reduce the size of the metadata
        self.assertTrue(len(str(filtered)) < len(str(unfiltered)))

        def _format_url_keys(fmt_list):
            # every (format_id, key, value) triple whose key mentions 'url'
            return [
                (fmt['format_id'], key, fmt[key],)
                for fmt in fmt_list
                for key in fmt.keys()
                if 'url' in key
            ]

        unfiltered_url_keys = _format_url_keys(unfiltered['formats'])
        self.assertEqual(63, len(unfiltered_url_keys), msg=str(unfiltered_url_keys))

        filtered_url_keys = _format_url_keys(filtered['formats'])
        self.assertEqual(3, len(filtered_url_keys), msg=str(filtered_url_keys))

        # no caption url should survive filtering
        caption_url_keys = [
            (lang_code, caption['ext'], caption[key],)
            for lang_code, captions in filtered['automatic_captions'].items()
            for caption in captions
            for key in caption.keys()
            if 'url' in key
        ]
        self.assertEqual(0, len(caption_url_keys), msg=str(caption_url_keys))
1789+
1790+
17121791
class TasksTestCase(TestCase):
17131792

17141793
def setUp(self):

tubesync/sync/utils.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import re
33
import math
4+
from copy import deepcopy
45
from operator import itemgetter
56
from pathlib import Path
67
from tempfile import NamedTemporaryFile
@@ -171,6 +172,95 @@ def normalize_codec(codec_str):
171172
return result
172173

173174

175+
def _url_keys(arg_dict, filter_func):
176+
result = {}
177+
for key in arg_dict.keys():
178+
if 'url' in key:
179+
result.update(
180+
{key: filter_func(key=key, url=arg_dict[key])}
181+
)
182+
return result
183+
184+
185+
def _drop_url_keys(arg_dict, key, filter_func):
186+
if key in arg_dict.keys():
187+
for val_dict in arg_dict[key]:
188+
for url_key, remove in _url_keys(val_dict, filter_func).items():
189+
if remove is True:
190+
del val_dict[url_key]
191+
192+
193+
def filter_response(arg_dict, copy_arg=False):
    '''
    Clean up a yt-dlp style response dict so that bulky, short-lived
    details (expiring / IP-locked URLs, HTTP headers, fragment lists)
    are not stored in the database.

    arg_dict: the response dict; anything else raises TypeError.
    copy_arg: when True, work on a deep copy and leave the caller's
              dict untouched; when False, filter arg_dict in place.
    Returns the filtered dict (the copy, or arg_dict itself).
    '''
    # refuse anything that is not a dict up front
    if not isinstance(arg_dict, dict):
        raise TypeError(f'response_dict must be a dict, got "{type(arg_dict)}"')
    response_dict = deepcopy(arg_dict) if copy_arg else arg_dict

    # optimize the empty case
    if not response_dict:
        return response_dict

    def _strip_url_keys(entry, is_unwanted):
        # delete every '*url*' key whose value the predicate marks True
        doomed = [
            k for k, v in entry.items()
            if 'url' in k and is_unwanted(key=k, url=v) is True
        ]
        for k in doomed:
            del entry[k]

    # beginning of formats cleanup {{{
    # drop urls that expire, or restrict IPs
    def unwanted_format_url(**kwargs):
        url = kwargs['url']
        return bool(
            url
            and '://' in url
            and (
                '/ip/' in url
                or 'ip=' in url
                or '/expire/' in url
                or 'expire=' in url
            )
        )

    # these format keys are not useful to us
    junk_format_keys = frozenset((
        'downloader_options',
        'fragments',
        'http_headers',
        '__needs_testing',
        '__working',
    ))
    for list_key in ('formats', 'requested_formats'):
        if list_key not in response_dict:
            continue
        for fmt in response_dict[list_key]:
            _strip_url_keys(fmt, unwanted_format_url)
            for junk in junk_format_keys:
                fmt.pop(junk, None)
    # end of formats cleanup }}}

    # beginning of subtitles cleanup {{{
    # drop urls that expire
    def unwanted_caption_url(**kwargs):
        url = kwargs['url']
        return bool(
            url
            and '://' in url
            and (
                '/expire/' in url
                or '&expire=' in url
            )
        )

    for caption_key in ('subtitles', 'automatic_captions'):
        if caption_key not in response_dict:
            continue
        for track_list in response_dict[caption_key].values():
            for track in track_list:
                _strip_url_keys(track, unwanted_caption_url)
    # end of subtitles cleanup }}}

    return response_dict
262+
263+
174264
def parse_media_format(format_dict):
175265
'''
176266
This parser primarily adapts the format dict returned by youtube-dl into a

0 commit comments

Comments
 (0)