Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bibtexparser/middlewares/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from bibtexparser.middlewares.names import SeparateCoAuthors
from bibtexparser.middlewares.names import SplitNameParts
from bibtexparser.middlewares.sorting_blocks import SortBlocksByTypeAndKeyMiddleware
from bibtexparser.middlewares.sorting_blocks import SortBlocksMiddleware
from bibtexparser.middlewares.sorting_entry_fields import SortFieldsAlphabeticallyMiddleware
from bibtexparser.middlewares.sorting_entry_fields import SortFieldsCustomMiddleware

Expand Down
141 changes: 94 additions & 47 deletions bibtexparser/middlewares/sorting_blocks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from copy import deepcopy
from dataclasses import dataclass
from dataclasses import field
from typing import Any
from typing import Callable
from typing import List
from typing import Tuple
from typing import Type
Expand All @@ -22,58 +24,91 @@
class _BlockJunk:
"""Data-Structure reflecting zero or more comments together with a block."""

sort_key: str = ""
# The blocks (comments and the main block) are stored in the order they were parsed.
blocks: List[Block] = field(default_factory=list)

@property
def main_block_type(self) -> type:
"""Returns the type of the main (i.e., non-comment) block."""
def main_block(self) -> Block:
"""Returns the main (i.e., last, non-comment) block of this junk."""
try:
return type(self.blocks[-1])
return self.blocks[-1]
except IndexError:
raise RuntimeError(
"Block junk must contain at least one block. "
"This is a bug in bibtexparser, please report it."
)


class SortBlocksByTypeAndKeyMiddleware(LibraryMiddleware):
"""Sorts the blocks of a library by type and key. Optionally, comments remain above same block."""
class SortBlocksMiddleware(LibraryMiddleware):
"""Sorts the blocks of a library by a user-provided sort key.

This middleware works like Python's built-in sorting
(the ``key`` and ``reverse`` arguments behave as in :func:`sorted`):
The ``key`` callable is applied to each block and the blocks are
sorted by the returned values.

Example: To sort entries by their ``year`` field, with all non-entry
blocks (strings, preambles, ...) and year-less entries on top::

from bibtexparser.middlewares import SortBlocksMiddleware
from bibtexparser.model import Entry

def by_year(block):
if isinstance(block, Entry) and "year" in block:
return (1, int(block["year"]))
return (0, 0)

middleware = SortBlocksMiddleware(key=by_year)

Hints regarding the ``key`` callable:

- It must accept every block it may be called with (see below)
and the returned values must be mutually comparable.
Returning tuples - as in the example above - is a simple way
to achieve this for libraries with mixed block types,
and also allows hierarchical sorting criteria.
- The sort is stable: blocks for which the key returns equal values
remain in their original relative order.
- The key should be pure (deterministic and without side-effects).
- If you have a comparator function (``compare(block_1, block_2) -> int``)
instead of a key function, wrap it with :func:`functools.cmp_to_key`.

Comment handling: if ``preserve_comments_on_top`` is True (default),
comments remain directly above the consecutive non-comment block
and the key is only called with said non-comment block.
(Exception: for comments at the very end of the library - not followed
by any non-comment block - the key is called with the last comment.)
If ``preserve_comments_on_top`` is False, comments are sorted
like all other blocks, and the key must thus handle comment blocks, too.
"""

def __init__(
self,
block_type_order: Tuple[Type[Block], ...] = DEFAULT_BLOCK_TYPE_ORDER,
key: Callable[[Block], Any],
reverse: bool = False,
preserve_comments_on_top: bool = True,
):
self._verify_all_types_are_block_types(block_type_order)
self._block_type_order = block_type_order
"""

:param key: Callable mapping a block to a sort key, as in :func:`sorted`.
See the class docstring for requirements and an example.
:param reverse: If True, sort in descending order.
:param preserve_comments_on_top: If True, comments remain above
the following non-comment block (sorted as one unit).
"""
self._key = key
self._reverse = reverse
self._preserve_comments_on_top = preserve_comments_on_top

# In-place modification is not yet supported; we make this explicit here.
super().__init__(allow_inplace_modification=False)

@staticmethod
def _verify_all_types_are_block_types(sort_order):
for t in sort_order:
if not issubclass(t, Block):
raise ValueError(
"Sort order must only contain Block subclasses, " f"but got {str(t)}"
)

@staticmethod
def _block_junks(blocks: List[Block]) -> List[_BlockJunk]:
block_junks = []
current_junk = _BlockJunk()
for block in blocks:
current_junk.blocks.append(block)
try:
current_junk.sort_key = block.key
except AttributeError:
# Block has no key that could be used as sort key
# (this happens for comments, preambles and parsing-failed blocks, for example)
pass

if not (isinstance(block, ExplicitComment) or isinstance(block, ImplicitComment)):
# We added a non-comment block, hence we finish the junk and
# start a new one
Expand All @@ -91,32 +126,44 @@ def transform(self, library: Library) -> Library:
blocks = deepcopy(library.blocks)
if self._preserve_comments_on_top:
block_junks = self._block_junks(blocks)

def _sort_key(block_junk):
"""Sort key for block junks. Based on (block type, string-or-entry-key)."""
try:
return (
self._block_type_order.index(block_junk.main_block_type),
block_junk.sort_key,
)
except ValueError:
# If the block type is not in the order list, put it at the end
return len(self._block_type_order), block_junk.sort_key

block_junks.sort(key=_sort_key)
block_junks.sort(key=lambda junk: self._key(junk.main_block), reverse=self._reverse)
return Library(
blocks=[block for block_junk in block_junks for block in block_junk.blocks]
)
else:
blocks.sort(key=self._key, reverse=self._reverse)
return Library(blocks=blocks)

def _sort_key(block: Block):
"""Sort key for blocks. Based on (block type, string-or-entry-key)."""
block_key = getattr(block, "key", "")
try:
return self._block_type_order.index(block.__class__), block_key
except ValueError:
# If the block type is not in the order list, put it at the end
return len(self._block_type_order), block_key

blocks.sort(key=_sort_key)
return Library(blocks=blocks)
class SortBlocksByTypeAndKeyMiddleware(SortBlocksMiddleware):
    """Sorts the blocks of a library by type and key.

    Blocks are ordered primarily by the position of their type in
    ``block_type_order`` (blocks whose type is not listed are put last) and
    secondarily by their ``key`` attribute (blocks without a ``key``
    attribute are sorted as if their key was the empty string).
    Optionally, comments remain above same block.
    """

    def __init__(
        self,
        block_type_order: Tuple[Type[Block], ...] = DEFAULT_BLOCK_TYPE_ORDER,
        preserve_comments_on_top: bool = True,
    ):
        """

        :param block_type_order: Block classes in the order in which blocks
            of these types should appear in the sorted library.
        :param preserve_comments_on_top: Passed on unchanged to
            :class:`SortBlocksMiddleware`.
        :raises ValueError: If ``block_type_order`` contains anything
            that is not a subclass of ``Block``.
        """
        self._verify_all_types_are_block_types(block_type_order)
        # Must be set before the parent constructor receives the bound
        # sort-key method below, which reads this attribute when called.
        self._block_type_order = block_type_order

        super().__init__(
            key=self._type_and_key_sort_key,
            preserve_comments_on_top=preserve_comments_on_top,
        )

    @staticmethod
    def _verify_all_types_are_block_types(sort_order):
        """Raise a ValueError unless every item of ``sort_order`` is a Block subclass."""
        for t in sort_order:
            # ``issubclass`` raises a TypeError for non-class objects (e.g. instances
            # or strings); check ``isinstance(t, type)`` first, so that such items
            # are reported with the same descriptive ValueError.
            if not (isinstance(t, type) and issubclass(t, Block)):
                raise ValueError(
                    "Sort order must only contain Block subclasses, " f"but got {str(t)}"
                )

    def _type_and_key_sort_key(self, block: Block) -> Tuple[int, str]:
        """Sort key for blocks. Based on (block type, string-or-entry-key)."""
        try:
            # Exact type match: the index of the block's class in the configured order.
            type_index = self._block_type_order.index(type(block))
        except ValueError:
            # If the block type is not in the order list, put it at the end
            type_index = len(self._block_type_order)
        # Blocks without a ``key`` attribute fall back to the empty string.
        return type_index, getattr(block, "key", "")
35 changes: 35 additions & 0 deletions docs/source/customize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,40 @@ Names
Sorting
:::::::

* :mod:`bibtexparser.middlewares.SortBlocksMiddleware`
* :mod:`bibtexparser.middlewares.SortBlocksByTypeAndKeyMiddleware`
* :mod:`bibtexparser.middlewares.SortFieldsAlphabeticallyMiddleware`
* :mod:`bibtexparser.middlewares.SortFieldsCustomMiddleware`

:class:`bibtexparser.middlewares.SortBlocksMiddleware` allows sorting blocks by any custom criterion:
It takes a sort-key function which - like the ``key`` argument of Python's built-in :func:`sorted` -
maps each block to a value to sort by. For example, to write a library with its entries sorted by year:

.. code-block:: python

import bibtexparser
import bibtexparser.middlewares as m
from bibtexparser.model import Entry

def by_year(block):
# Tuple sort keys allow sorting libraries with mixed block types:
# Non-entries (e.g. @string) and entries without a year are put on top,
# remaining entries are sorted by year, ties broken by citation key.
if isinstance(block, Entry) and "year" in block:
return (1, int(block["year"]), block.key)
return (0, 0, "")

library = bibtexparser.parse_file("bibtex.bib")
bibtexparser.write_file(
"sorted.bib", library, prepend_middleware=[m.SortBlocksMiddleware(key=by_year)]
)

Descending order is available via ``reverse=True``. By default, comments remain attached
to the (entry or other) block they precede; pass ``preserve_comments_on_top=False`` to sort
them like any other block. The sort is stable, i.e., blocks with equal sort keys remain
in their previous order. See the class docstring for further details, e.g., on how to use
comparator functions instead of sort-key functions.

.. note::
As opposed to bibtexparser v1, the en- and decoding of latex characters is now handled by a third-party library.
Previously, this part was responsible for much of the code complexity and bugs in bibtexparser,
Expand Down Expand Up @@ -164,6 +194,11 @@ Should extend the :class:`bibtexparser.middlewares.LibraryMiddleware` class.
This includes functionalities similar to sorting blocks
(e.g. :mod:`bibtexparser.middlewares.SortBlocksByTypeAndKeyMiddleware`).

.. note::
For custom sorting, you usually don't have to write your own middleware:
Pass a sort-key function to :class:`bibtexparser.middlewares.SortBlocksMiddleware`
instead (see :ref:`middleware_sorting`).

Warning
:::::::

Expand Down
119 changes: 119 additions & 0 deletions tests/middleware_tests/test_sorting_blocks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import bibtexparser
from bibtexparser import Library
from bibtexparser.middlewares.sorting_blocks import SortBlocksByTypeAndKeyMiddleware
from bibtexparser.middlewares.sorting_blocks import SortBlocksMiddleware
from bibtexparser.model import Entry
from bibtexparser.model import ExplicitComment
from bibtexparser.model import ImplicitComment
Expand Down Expand Up @@ -102,3 +104,120 @@ def test_sorting_blocks_no_comment_preserving_with_custom_order():
assert ordered_blocks[11] == ExplicitComment("explicit_comment_b")

assert len(ordered_blocks) == len(BLOCKS)


# Three article entries whose years (2020, 1999, 2005) are deliberately not in
# chronological order; used as input by the custom-key sorting tests below.
ENTRIES_BIBTEX = """
@article{newest, author = {Author, C}, year = {2020}}
@article{oldest, author = {Author, B}, year = {1999}}
@article{middle, author = {Author, A}, year = {2005}}
"""


def test_sorting_blocks_by_custom_key_year_field():
    """Entries are ordered ascending by the integer value of their ``year`` field."""
    parsed = bibtexparser.parse_string(ENTRIES_BIBTEX)
    sorter = SortBlocksMiddleware(key=lambda block: int(block["year"]))

    result = sorter.transform(parsed)

    sorted_keys = [entry.key for entry in result.entries]
    assert sorted_keys == ["oldest", "middle", "newest"]


def test_sorting_blocks_by_custom_key_reverse():
    """With ``reverse=True``, entries are ordered descending by year."""
    parsed = bibtexparser.parse_string(ENTRIES_BIBTEX)
    sorter = SortBlocksMiddleware(
        key=lambda block: int(block["year"]),
        reverse=True,
    )

    result = sorter.transform(parsed)

    sorted_keys = [entry.key for entry in result.entries]
    assert sorted_keys == ["newest", "middle", "oldest"]


def test_sorting_blocks_by_custom_hierarchical_key():
    """A tuple sort key sorts by year first and breaks ties by author."""
    bibtex = """
@article{same_year_b, author = {Author, B}, year = {2005}}
@article{newest, author = {Author, C}, year = {2020}}
@article{same_year_a, author = {Author, A}, year = {2005}}
"""

    def year_then_author(block):
        # Primary criterion: year; secondary criterion: author string.
        return (int(block["year"]), block["author"])

    parsed = bibtexparser.parse_string(bibtex)
    result = SortBlocksMiddleware(key=year_then_author).transform(parsed)

    sorted_keys = [entry.key for entry in result.entries]
    assert sorted_keys == ["same_year_a", "same_year_b", "newest"]


def test_sorting_blocks_by_custom_key_is_stable():
    """Entries with equal sort keys keep their input order (stable sort)."""
    bibtex = """
@article{c, year = {2005}}
@article{a, year = {2005}}
@article{b, year = {2005}}
"""
    parsed = bibtexparser.parse_string(bibtex)
    sorter = SortBlocksMiddleware(key=lambda block: int(block["year"]))

    result = sorter.transform(parsed)

    # All three entries share the same year, so nothing may be reordered.
    sorted_keys = [entry.key for entry in result.entries]
    assert sorted_keys == ["c", "a", "b"]


def test_sorting_blocks_by_custom_key_with_mixed_block_types():
    """Tuple sort keys allow entries and non-entry blocks to be sorted together."""
    bibtex = """
@article{newer, year = {2020}}
@string{me = "My Name"}
@article{older, year = {1999}}
@preamble{"some preamble"}
"""

    def entries_by_year_others_on_top(block):
        # Rank 0 keeps every non-entry block ahead of the entries (rank 1),
        # which are then ordered by their year.
        if not isinstance(block, Entry):
            return (0, 0)
        return (1, int(block["year"]))

    sorter = SortBlocksMiddleware(key=entries_by_year_others_on_top)
    result = sorter.transform(bibtexparser.parse_string(bibtex))

    block_types = [type(block) for block in result.blocks]
    assert block_types == [String, Preamble, Entry, Entry]
    entry_keys = [entry.key for entry in result.entries]
    assert entry_keys == ["older", "newer"]


def test_sorting_blocks_by_custom_key_keeps_comments_on_top():
    """By default, a comment moves together with the entry it precedes."""
    bibtex = """
% comment belonging to newer
@article{newer, year = {2020}}
@article{older, year = {1999}}
"""
    parsed = bibtexparser.parse_string(bibtex)
    sorter = SortBlocksMiddleware(key=lambda block: int(block["year"]))

    sorted_blocks = sorter.transform(parsed).blocks

    # "older" sorts first; the comment stays directly above "newer".
    assert isinstance(sorted_blocks[0], Entry)
    assert sorted_blocks[0].key == "older"
    assert isinstance(sorted_blocks[1], ImplicitComment)
    assert sorted_blocks[1].comment == "% comment belonging to newer"
    assert isinstance(sorted_blocks[2], Entry)
    assert sorted_blocks[2].key == "newer"


def test_sorting_blocks_by_custom_key_without_comment_preservation():
    """With ``preserve_comments_on_top=False``, comments are sorted like any other block."""
    bibtex = """
% some comment
@article{newer, year = {2020}}
@article{older, year = {1999}}
"""

    def comments_last(block):
        # Entries get rank 0 and are ordered by year; everything else
        # (here: the comment) gets rank 1 and therefore ends up last.
        if not isinstance(block, Entry):
            return (1, 0)
        return (0, int(block["year"]))

    parsed = bibtexparser.parse_string(bibtex)
    sorter = SortBlocksMiddleware(key=comments_last, preserve_comments_on_top=False)

    sorted_blocks = sorter.transform(parsed).blocks

    assert isinstance(sorted_blocks[0], Entry)
    assert sorted_blocks[0].key == "older"
    assert isinstance(sorted_blocks[1], Entry)
    assert sorted_blocks[1].key == "newer"
    assert isinstance(sorted_blocks[2], ImplicitComment)
Loading