Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion gallery_dl/extractor/bellazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

"""Extractors for https://www.bellazon.com/"""

from .common import Extractor, Message
from .common import Extractor, Message, mark_queue_rollback
from .. import text

BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main"
Expand Down Expand Up @@ -48,6 +48,11 @@ def items(self):

yield Message.Directory, "", data
data["num"] = data["num_internal"] = data["num_external"] = 0
mark_queue_rollback(
data,
("num", "num_external", "count"),
("post.count",),
)
for info, url, url_img in urls:
if url_img:
url = text.unescape(
Expand Down
9 changes: 9 additions & 0 deletions gallery_dl/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@
urllib3 = requests.packages.urllib3


def mark_queue_rollback(data, counters=(), nested=()):
    """Attach queue-rollback annotations to a metadata dict.

    'counters' is stored under Message.QueueRollback and 'nested' under
    Message.QueueRollbackNested. An empty (falsy) argument leaves the
    corresponding key untouched. 'data' is modified in place and returned.
    """
    annotations = (
        (Message.QueueRollback, counters),
        (Message.QueueRollbackNested, nested),
    )
    for marker, fields in annotations:
        if fields:
            data[marker] = fields
    return data


class Extractor():

category = ""
Expand Down
11 changes: 11 additions & 0 deletions gallery_dl/extractor/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ class Message():
- 2nd element is the (external) URL as a string
- 3rd element is a dictionary containing URL-specific metadata

Optional metadata keys for Queue messages:
- Message.QueueRollback:
- Tuple/list of top-level integer metadata fields that should be
decremented if queue handling gets skipped (for example due to an
extractor blacklist).
- Message.QueueRollbackNested:
- Tuple/list of dotted metadata paths (e.g. "post.count") that should
be decremented under the same conditions.

- Message.Urllist: # obsolete
- Same as Message.Url, but its 2nd element is a list of multiple URLs
- The additional URLs serve as a fallback if the primary one fails
Expand All @@ -52,5 +61,7 @@ class Message():
# Headers = 4
# Cookies = 5
Queue = 6
QueueRollback = "_queue_rollback"
QueueRollbackNested = "_queue_rollback_nested"
# Urllist = 7
# Metadata = 8
7 changes: 6 additions & 1 deletion gallery_dl/extractor/xenforo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

"""Extractors for XenForo forums"""

from .common import BaseExtractor, Message
from .common import BaseExtractor, Message, mark_queue_rollback
from .. import text, util
from ..cache import cache
import binascii
Expand Down Expand Up @@ -62,6 +62,11 @@ def items(self):
data["_http_expected_status"] = (403,)
data["_http_validate"] = self._validate
data["num"] = data["num_internal"] = data["num_external"] = 0
mark_queue_rollback(
data,
("num", "num_external", "count"),
("post.count",),
)
for video, inl, bb, ext in urls:
if ext:
if ext[0] == "#":
Expand Down
36 changes: 36 additions & 0 deletions gallery_dl/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ def handle_directory(self, kwdict):

def handle_queue(self, url, kwdict):
if url in self.visited:
self._rollback_queue_metadata(kwdict)
return
self.visited.add(url)

Expand All @@ -498,13 +499,15 @@ def handle_queue(self, url, kwdict):
for callback in self.hooks["child"]:
callback(pathfmt)

filtered = False
if cls := kwdict.get("_extractor"):
extr = cls.from_url(url)
else:
if extr := extractor.find(url):
if self._extractor_filter is None:
self._extractor_filter = self._build_extractor_filter()
if not self._extractor_filter(extr):
filtered = True
extr = None

if extr:
Expand Down Expand Up @@ -569,6 +572,8 @@ def handle_queue(self, url, kwdict):
pass

else:
if filtered:
self._rollback_queue_metadata(kwdict)
self._write_unsupported(url)

if "child-after" in self.hooks:
Expand All @@ -577,6 +582,37 @@ def handle_queue(self, url, kwdict):
for callback in self.hooks["child-after"]:
callback(pathfmt)

@staticmethod
def _decrement_counter(mapping, key):
    """Decrement the positive integer counter stored at mapping[key].

    Returns True if the value was decremented in place and False if it
    was left untouched, which happens when the key is missing or its
    value is not an integer, is a bool, or is not greater than zero.
    """
    value = mapping.get(key)

    # bool is a subclass of int: without this check a flag value of
    # True would be treated as a counter of 1 and overwritten with 0
    if isinstance(value, bool) or not isinstance(value, int):
        return False

    # never let a counter drop below zero
    if value <= 0:
        return False

    mapping[key] = value - 1
    return True

def _rollback_queue_metadata(self, kwdict):
    """Decrement the counters a Queue message marked for rollback.

    Top-level field names are read from kwdict[Message.QueueRollback],
    dotted paths such as "post.count" from
    kwdict[Message.QueueRollbackNested]. Each referenced value is passed
    to _decrement_counter(). Returns True if at least one counter was
    decremented, False otherwise.
    """
    changed = False

    for name in kwdict.get(Message.QueueRollback, ()):
        if self._decrement_counter(kwdict, name):
            changed = True

    for dotted in kwdict.get(Message.QueueRollbackNested, ()):
        *parents, leaf = dotted.split(".")
        if not parents:
            # a path without a "." has no parent dict to descend into
            continue

        # walk down to the dict that holds the final path component
        node = kwdict
        for segment in parents:
            node = node.get(segment)
            if not isinstance(node, dict):
                # missing or non-dict intermediate value: ignore this path
                node = None
                break

        if node is not None and self._decrement_counter(node, leaf):
            changed = True

    return changed

def handle_finalize(self):
if self.archive:
if not self.status:
Expand Down
55 changes: 54 additions & 1 deletion test/test_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import job, config, text # noqa E402
from gallery_dl.extractor.common import Extractor, Message # noqa E402
from gallery_dl.extractor.common import ( # noqa E402
Extractor,
Message,
mark_queue_rollback,
)


class TestJob(unittest.TestCase):
Expand Down Expand Up @@ -107,6 +111,34 @@ def test_parent_metadata_extractor(self):
# no output if '_extractor' is overwritten (#8958)
self.assertEqual(out, "11\n")

def test_queue_rollback_for_blacklisted_extractor(self):
    """Counters must be rolled back to 0 when the queued URL is skipped.

    The queue URL points at reddit while "reddit" is set as blacklist,
    so the child extractor is expected to be filtered out.
    """
    option_path = ("extractor", "test_category", "test_queue_counter")
    queue_url = "https://www.reddit.com/r/python/"

    config.set((), "download", False)
    config.set((), "blacklist", "reddit")
    config.set(option_path, "queue-url", queue_url)

    extr = TestExtractorQueueCounter.from_url("test:queue-counter")
    self.jobclass(extr).run()

    data = extr.data
    observed = (
        data["num"],
        data["num_external"],
        data["count"],
        data["post"]["count"],
    )
    self.assertEqual(observed, (0, 0, 0, 0))

def test_queue_rollback_keeps_counts_when_queue_is_processed(self):
    """Counters must stay at 1 when the queued URL is not skipped.

    No "queue-url" option is set here, so the extractor falls back to
    its default "test:noop" URL.
    """
    config.set((), "download", False)

    extr = TestExtractorQueueCounter.from_url("test:queue-counter")
    self.jobclass(extr).run()

    data = extr.data
    observed = (
        data["num"],
        data["num_external"],
        data["count"],
        data["post"]["count"],
    )
    self.assertEqual(observed, (1, 1, 1, 1))


class TestKeywordJob(TestJob):
jobclass = job.KeywordJob
Expand Down Expand Up @@ -574,6 +606,27 @@ def items(self):
}


class TestExtractorQueueCounter(Extractor):
    """Test extractor emitting one Directory and one Queue message.

    Both messages share a single metadata dict whose counters start at 1
    and are marked for queue rollback. The dict is kept as 'self.data'
    so tests can inspect it after a job has run.
    """
    category = "test_category"
    subcategory = "test_queue_counter"
    pattern = r"test:queue-counter$"

    def items(self):
        top_level = ("num", "num_external", "count")

        # every counter starts at 1, including the nested 'post.count'
        data = dict.fromkeys(top_level, 1)
        data["post"] = {"count": 1}
        self.data = data

        mark_queue_rollback(data, top_level, ("post.count",))
        yield Message.Directory, "", data

        # the queued URL is configurable; "test:noop" is the default
        target = self.config("queue-url", "test:noop")
        yield Message.Queue, target, data


class TestExtractorException(Extractor):
category = "test_category"
subcategory = "test_subcategory_exception"
Expand Down
Loading