Skip to content

Commit 3445c51

Browse files
committed
[job] add 'output.jsonl' option (#8953)
1 parent 532ab71 commit 3445c51

File tree

4 files changed

+48
-5
lines changed

4 files changed

+48
-5
lines changed

docs/configuration.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8035,6 +8035,17 @@ Description
80358035
Include fallback URLs in the output of ``-g/--get-urls``.
80368036

80378037

8038+
output.jsonl
8039+
------------
8040+
Type
8041+
``bool``
8042+
Default
8043+
``false``
8044+
Description
8045+
Output ``-j/--dump-json`` & ``-J/--resolve-json``
8046+
data in `JSON Lines <https://jsonlines.org/>`__ format.
8047+
8048+
80388049
output.private
80398050
--------------
80408051
Type

docs/gallery-dl.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,6 +1276,7 @@
12761276
{
12771277
"ansi" : true,
12781278
"fallback" : true,
1279+
"jsonl" : false,
12791280
"mode" : "auto",
12801281
"private" : false,
12811282
"progress" : true,

gallery_dl/job.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -947,13 +947,16 @@ def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True,
947947
self.data_meta = []
948948
self.exception = None
949949
self.ascii = config.get(("output",), "ascii", ensure_ascii)
950+
self.jsonl = config.get(("output",), "jsonl", False)
950951
self.resolve = 128 if resolve is True else (resolve or self.resolve)
951952

952953
private = config.get(("output",), "private")
953954
self.filter = dict.copy if private else util.filter_dict
954955

955956
if self.resolve > 0:
956957
self.handle_queue = self.handle_queue_resolve
958+
if not self.jsonl:
959+
self.out = util.noop
957960

958961
def run(self):
959962
self._init()
@@ -983,7 +986,7 @@ def run(self):
983986
for msg in self.data:
984987
util.transform_dict(msg[-1], util.number_to_string)
985988

986-
if self.file:
989+
if self.file and not self.jsonl:
987990
# dump to 'file'
988991
try:
989992
util.dump_json(self.data, self.file, self.ascii, 2)
@@ -993,22 +996,30 @@ def run(self):
993996

994997
return 0
995998

999+
def out(self, msg):
1000+
self.file.write(util.json_dumps(msg))
1001+
self.file.write("\n")
1002+
self.file.flush()
1003+
9961004
def handle_url(self, url, kwdict):
9971005
kwdict = self.filter(kwdict)
1006+
self.out(msg := (Message.Url, url, kwdict))
9981007
self.data_urls.append(url)
9991008
self.data_meta.append(kwdict)
1000-
self.data.append((Message.Url, url, kwdict))
1009+
self.data.append(msg)
10011010

10021011
def handle_directory(self, kwdict):
10031012
kwdict = self.filter(kwdict)
1013+
self.out(msg := (Message.Directory, kwdict))
10041014
self.data_post.append(kwdict)
1005-
self.data.append((Message.Directory, kwdict))
1015+
self.data.append(msg)
10061016

10071017
def handle_queue(self, url, kwdict):
10081018
kwdict = self.filter(kwdict)
1019+
self.out(msg := (Message.Queue, url, kwdict))
10091020
self.data_urls.append(url)
10101021
self.data_meta.append(kwdict)
1011-
self.data.append((Message.Queue, url, kwdict))
1022+
self.data.append(msg)
10121023

10131024
def handle_queue_resolve(self, url, kwdict):
10141025
if cls := kwdict.get("_extractor"):
@@ -1018,9 +1029,10 @@ def handle_queue_resolve(self, url, kwdict):
10181029

10191030
if not extr:
10201031
kwdict = self.filter(kwdict)
1032+
self.out(msg := (Message.Queue, url, kwdict))
10211033
self.data_urls.append(url)
10221034
self.data_meta.append(kwdict)
1023-
return self.data.append((Message.Queue, url, kwdict))
1035+
return self.data.append(msg)
10241036

10251037
job = self.__class__(extr, self, None, self.ascii, self.resolve-1)
10261038
job.data = self.data

test/test_job.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,25 @@ def test_num_string(self):
359359
self.assertEqual(tjob.data[-1][0], Message.Url)
360360
self.assertEqual(tjob.data[-1][2]["num"], "3")
361361

362+
def test_jsonl(self):
363+
extr = TestExtractor.from_url("test:")
364+
tjob = self.jobclass(extr, file=io.StringIO())
365+
with patch("gallery_dl.job.DataJob.out") as out:
366+
tjob.run()
367+
self.assertEqual(len(out.call_args_list), 0)
368+
369+
config.set(("output",), "jsonl", True)
370+
extr = TestExtractor.from_url("test:")
371+
file = io.StringIO()
372+
tjob = self.jobclass(extr, file=file)
373+
with patch("gallery_dl.job.DataJob.out") as out:
374+
tjob.run()
375+
self.assertEqual(len(out.call_args_list), 4)
376+
377+
tjob.run()
378+
for line in file.getvalue().split():
379+
self.assertRegex(line, r"""^\[[23],("http[^"]+",)?\{.+\}\]$""")
380+
362381

363382
class TestExtractor(Extractor):
364383
category = "test_category"

0 commit comments

Comments
 (0)