diff --git a/bibtexparser/middlewares/__init__.py b/bibtexparser/middlewares/__init__.py index 3fd1a36..7202200 100644 --- a/bibtexparser/middlewares/__init__.py +++ b/bibtexparser/middlewares/__init__.py @@ -15,6 +15,7 @@ from bibtexparser.middlewares.names import SeparateCoAuthors from bibtexparser.middlewares.names import SplitNameParts from bibtexparser.middlewares.sorting_blocks import SortBlocksByTypeAndKeyMiddleware +from bibtexparser.middlewares.sorting_blocks import SortBlocksMiddleware from bibtexparser.middlewares.sorting_entry_fields import SortFieldsAlphabeticallyMiddleware from bibtexparser.middlewares.sorting_entry_fields import SortFieldsCustomMiddleware diff --git a/bibtexparser/middlewares/sorting_blocks.py b/bibtexparser/middlewares/sorting_blocks.py index 5ff5f13..117646d 100644 --- a/bibtexparser/middlewares/sorting_blocks.py +++ b/bibtexparser/middlewares/sorting_blocks.py @@ -1,6 +1,8 @@ from copy import deepcopy from dataclasses import dataclass from dataclasses import field +from typing import Any +from typing import Callable from typing import List from typing import Tuple from typing import Type @@ -22,15 +24,14 @@ class _BlockJunk: """Data-Structure reflecting zero or more comments together with a block.""" - sort_key: str = "" # The blocks (comments and the main block) are stored in the order they were parsed. blocks: List[Block] = field(default_factory=list) @property - def main_block_type(self) -> type: - """Returns the type of the main (i.e., non-comment) block.""" + def main_block(self) -> Block: + """Returns the main (i.e., last, non-comment) block of this junk.""" try: - return type(self.blocks[-1]) + return self.blocks[-1] except IndexError: raise RuntimeError( "Block junk must contain at least one block. " @@ -38,42 +39,76 @@ def main_block_type(self) -> type: ) -class SortBlocksByTypeAndKeyMiddleware(LibraryMiddleware): - """Sorts the blocks of a library by type and key. 
Optionally, comments remain above same block.""" +class SortBlocksMiddleware(LibraryMiddleware): + """Sorts the blocks of a library by a user-provided sort key. + + This middleware works like Python's built-in sorting + (the ``key`` and ``reverse`` arguments behave as in :func:`sorted`): + The ``key`` callable is applied to each block and the blocks are + sorted by the returned values. + + Example: To sort entries by their ``year`` field, with all non-entry + blocks (strings, preambles, ...) and year-less entries on top:: + + from bibtexparser.middlewares import SortBlocksMiddleware + from bibtexparser.model import Entry + + def by_year(block): + if isinstance(block, Entry) and "year" in block: + return (1, int(block["year"])) + return (0, 0) + + middleware = SortBlocksMiddleware(key=by_year) + + Hints regarding the ``key`` callable: + + - It must accept every block it may be called with (see below) + and the returned values must be mutually comparable. + Returning tuples - as in the example above - is a simple way + to achieve this for libraries with mixed block types, + and also allows hierarchical sorting criteria. + - The sort is stable: blocks for which the key returns equal values + remain in their original relative order. + - The key should be pure (deterministic and without side-effects). + - If you have a comparator function (``compare(block_1, block_2) -> int``) + instead of a key function, wrap it with :func:`functools.cmp_to_key`. + + Comment handling: if ``preserve_comments_on_top`` is True (default), + comments remain directly above the consecutive non-comment block + and the key is only called with said non-comment block. + (Exception: for comments at the very end of the library - not followed + by any non-comment block - the key is called with the last comment.) + If ``preserve_comments_on_top`` is False, comments are sorted + like all other blocks, and the key must thus handle comment blocks, too. 
+ """ def __init__( self, - block_type_order: Tuple[Type[Block], ...] = DEFAULT_BLOCK_TYPE_ORDER, + key: Callable[[Block], Any], + reverse: bool = False, preserve_comments_on_top: bool = True, ): - self._verify_all_types_are_block_types(block_type_order) - self._block_type_order = block_type_order + """ + + :param key: Callable mapping a block to a sort key, as in :func:`sorted`. + See the class docstring for requirements and an example. + :param reverse: If True, sort in descending order. + :param preserve_comments_on_top: If True, comments remain above + the following non-comment block (sorted as one unit). + """ + self._key = key + self._reverse = reverse self._preserve_comments_on_top = preserve_comments_on_top # In-place modification is not yet supported, we make this explicit here, super().__init__(allow_inplace_modification=False) - @staticmethod - def _verify_all_types_are_block_types(sort_order): - for t in sort_order: - if not issubclass(t, Block): - raise ValueError( - "Sort order must only contain Block subclasses, " f"but got {str(t)}" - ) - @staticmethod def _block_junks(blocks: List[Block]) -> List[_BlockJunk]: block_junks = [] current_junk = _BlockJunk() for block in blocks: current_junk.blocks.append(block) - try: - current_junk.sort_key = block.key - except AttributeError: - # Block has no key that could be used as sort key - # (this happens for comments, preambles and parsing-failed blocks, for example) - pass - if not (isinstance(block, ExplicitComment) or isinstance(block, ImplicitComment)): # We added a non-comment block, hence we finish the junk and # start a new one @@ -91,32 +126,44 @@ def transform(self, library: Library) -> Library: blocks = deepcopy(library.blocks) if self._preserve_comments_on_top: block_junks = self._block_junks(blocks) - - def _sort_key(block_junk): - """Sort key for block junks. 
Based on (block type, string-or-entry-key).""" - try: - return ( - self._block_type_order.index(block_junk.main_block_type), - block_junk.sort_key, - ) - except ValueError: - # If the block type is not in the order list, put it at the end - return len(self._block_type_order), block_junk.sort_key - - block_junks.sort(key=_sort_key) + block_junks.sort(key=lambda junk: self._key(junk.main_block), reverse=self._reverse) return Library( blocks=[block for block_junk in block_junks for block in block_junk.blocks] ) else: + blocks.sort(key=self._key, reverse=self._reverse) + return Library(blocks=blocks) - def _sort_key(block: Block): - """Sort key for blocks. Based on (block type, string-or-entry-key).""" - block_key = getattr(block, "key", "") - try: - return self._block_type_order.index(block.__class__), block_key - except ValueError: - # If the block type is not in the order list, put it at the end - return len(self._block_type_order), block_key - blocks.sort(key=_sort_key) - return Library(blocks=blocks) +class SortBlocksByTypeAndKeyMiddleware(SortBlocksMiddleware): + """Sorts the blocks of a library by type and key. Optionally, comments remain above same block.""" + + def __init__( + self, + block_type_order: Tuple[Type[Block], ...] = DEFAULT_BLOCK_TYPE_ORDER, + preserve_comments_on_top: bool = True, + ): + self._verify_all_types_are_block_types(block_type_order) + self._block_type_order = block_type_order + + super().__init__( + key=self._type_and_key_sort_key, + preserve_comments_on_top=preserve_comments_on_top, + ) + + @staticmethod + def _verify_all_types_are_block_types(sort_order): + for t in sort_order: + if not issubclass(t, Block): + raise ValueError( + "Sort order must only contain Block subclasses, " f"but got {str(t)}" + ) + + def _type_and_key_sort_key(self, block: Block) -> Tuple[int, str]: + """Sort key for blocks. 
Based on (block type, string-or-entry-key).""" + try: + type_index = self._block_type_order.index(type(block)) + except ValueError: + # If the block type is not in the order list, put it at the end + type_index = len(self._block_type_order) + return type_index, getattr(block, "key", "") diff --git a/docs/source/customize.rst b/docs/source/customize.rst index 6f94d6d..df46a07 100644 --- a/docs/source/customize.rst +++ b/docs/source/customize.rst @@ -106,10 +106,40 @@ Names Sorting ::::::: +* :mod:`bibtexparser.middlewares.SortBlocksMiddleware` * :mod:`bibtexparser.middlewares.SortBlocksByTypeAndKeyMiddleware` * :mod:`bibtexparser.middlewares.SortFieldsAlphabeticallyMiddleware` * :mod:`bibtexparser.middlewares.SortFieldsCustomMiddleware` +:class:`bibtexparser.middlewares.SortBlocksMiddleware` allows sorting blocks by any custom criterion: +It takes a sort-key function which - as the ``key`` argument of Python's built-in :func:`sorted` - +maps each block to a value to sort by. For example, to write a library with its entries sorted by year: + +.. code-block:: python + + import bibtexparser + import bibtexparser.middlewares as m + from bibtexparser.model import Entry + + def by_year(block): + # Tuple sort keys allow sorting libraries with mixed block types: + # Non-entries (e.g. @string) and entries without a year are put on top, + # remaining entries are sorted by year, ties broken by citation key. + if isinstance(block, Entry) and "year" in block: + return (1, int(block["year"]), block.key) + return (0, 0, "") + + library = bibtexparser.parse_file("bibtex.bib") + bibtexparser.write_file( + "sorted.bib", library, prepend_middleware=[m.SortBlocksMiddleware(key=by_year)] + ) + +Descending order is available via ``reverse=True``. By default, comments remain attached +to the (entry or other) block they precede; pass ``preserve_comments_on_top=False`` to sort +them like any other block. The sort is stable, i.e., blocks with equal sort keys remain +in their previous order. 
See the class docstring for further details, e.g., on how to use +comparator functions instead of sort-key functions. + .. note:: As opposed to bibtexparser v1, the en- and decoding of latex characters is now handled by a third-party library. Previously, this part was responsible for much of the code complexity and bugs in bibtexparser, @@ -164,6 +194,11 @@ Should extend the :class:`bibtexparser.middlewares.LibraryMiddleware` class. This includes functionalities similar to sorting blocks (e.g. :mod:`bibtexparser.middlewares.SortBlocksByTypeAndKeyMiddleware`). +.. note:: + For custom sorting, you usually don't have to write your own middleware: + Pass a sort-key function to :class:`bibtexparser.middlewares.SortBlocksMiddleware` + instead (see :ref:`middleware_sorting`). + Warning ::::::: diff --git a/tests/middleware_tests/test_sorting_blocks.py b/tests/middleware_tests/test_sorting_blocks.py index da0deca..7071685 100644 --- a/tests/middleware_tests/test_sorting_blocks.py +++ b/tests/middleware_tests/test_sorting_blocks.py @@ -1,5 +1,7 @@ +import bibtexparser from bibtexparser import Library from bibtexparser.middlewares.sorting_blocks import SortBlocksByTypeAndKeyMiddleware +from bibtexparser.middlewares.sorting_blocks import SortBlocksMiddleware from bibtexparser.model import Entry from bibtexparser.model import ExplicitComment from bibtexparser.model import ImplicitComment @@ -102,3 +104,120 @@ def test_sorting_blocks_no_comment_preserving_with_custom_order(): assert ordered_blocks[11] == ExplicitComment("explicit_comment_b") assert len(ordered_blocks) == len(BLOCKS) + + +ENTRIES_BIBTEX = """ +@article{newest, author = {Author, C}, year = {2020}} +@article{oldest, author = {Author, B}, year = {1999}} +@article{middle, author = {Author, A}, year = {2005}} +""" + + +def test_sorting_blocks_by_custom_key_year_field(): + library = bibtexparser.parse_string(ENTRIES_BIBTEX) + + library = SortBlocksMiddleware(key=lambda entry: int(entry["year"])).transform(library) + + 
assert [entry.key for entry in library.entries] == ["oldest", "middle", "newest"] + + +def test_sorting_blocks_by_custom_key_reverse(): + library = bibtexparser.parse_string(ENTRIES_BIBTEX) + + library = SortBlocksMiddleware(key=lambda entry: int(entry["year"]), reverse=True).transform( + library + ) + + assert [entry.key for entry in library.entries] == ["newest", "middle", "oldest"] + + +def test_sorting_blocks_by_custom_hierarchical_key(): + bibtex = """ + @article{same_year_b, author = {Author, B}, year = {2005}} + @article{newest, author = {Author, C}, year = {2020}} + @article{same_year_a, author = {Author, A}, year = {2005}} + """ + library = bibtexparser.parse_string(bibtex) + + # Sort by year first, ties broken by author + library = SortBlocksMiddleware( + key=lambda entry: (int(entry["year"]), entry["author"]) + ).transform(library) + + assert [entry.key for entry in library.entries] == ["same_year_a", "same_year_b", "newest"] + + +def test_sorting_blocks_by_custom_key_is_stable(): + bibtex = """ + @article{c, year = {2005}} + @article{a, year = {2005}} + @article{b, year = {2005}} + """ + library = bibtexparser.parse_string(bibtex) + + library = SortBlocksMiddleware(key=lambda entry: int(entry["year"])).transform(library) + + # Equal sort keys: input order must be preserved (stable sort) + assert [entry.key for entry in library.entries] == ["c", "a", "b"] + + +def test_sorting_blocks_by_custom_key_with_mixed_block_types(): + bibtex = """ + @article{newer, year = {2020}} + @string{me = "My Name"} + @article{older, year = {1999}} + @preamble{"some preamble"} + """ + + def entries_by_year_others_on_top(block): + # Tuple keys make mixed block types comparable: + # non-entries first, then entries sorted by year + if isinstance(block, Entry): + return (1, int(block["year"])) + return (0, 0) + + library = bibtexparser.parse_string(bibtex) + library = SortBlocksMiddleware(key=entries_by_year_others_on_top).transform(library) + + assert [type(block) for block in 
library.blocks] == [String, Preamble, Entry, Entry] + assert [entry.key for entry in library.entries] == ["older", "newer"] + + +def test_sorting_blocks_by_custom_key_keeps_comments_on_top(): + bibtex = """ + % comment belonging to newer + @article{newer, year = {2020}} + @article{older, year = {1999}} + """ + library = bibtexparser.parse_string(bibtex) + + library = SortBlocksMiddleware(key=lambda entry: int(entry["year"])).transform(library) + + blocks = library.blocks + assert isinstance(blocks[0], Entry) and blocks[0].key == "older" + assert isinstance(blocks[1], ImplicitComment) + assert blocks[1].comment == "% comment belonging to newer" + assert isinstance(blocks[2], Entry) and blocks[2].key == "newer" + + +def test_sorting_blocks_by_custom_key_without_comment_preservation(): + bibtex = """ + % some comment + @article{newer, year = {2020}} + @article{older, year = {1999}} + """ + + def comments_last(block): + if isinstance(block, Entry): + return (0, int(block["year"])) + return (1, 0) + + library = bibtexparser.parse_string(bibtex) + library = SortBlocksMiddleware(key=comments_last, preserve_comments_on_top=False).transform( + library + ) + + blocks = library.blocks + assert isinstance(blocks[0], Entry) and blocks[0].key == "older" + assert isinstance(blocks[1], Entry) and blocks[1].key == "newer" + assert isinstance(blocks[2], ImplicitComment)