diff --git a/mapillary_tools/exif_read.py b/mapillary_tools/exif_read.py
index d2fac8dfd..284d59377 100644
--- a/mapillary_tools/exif_read.py
+++ b/mapillary_tools/exif_read.py
@@ -1,7 +1,9 @@
 import abc
 import datetime
+import io
 import logging
 import re
+import struct
 import typing as T
 import xml.etree.ElementTree as et
 from fractions import Fraction
@@ -11,6 +13,7 @@
 from exifread.utils import Ratio
 
+LOG = logging.getLogger(__name__)
 XMP_NAMESPACES = {
     "exif": "http://ns.adobe.com/exif/1.0/",
     "tiff": "http://ns.adobe.com/tiff/1.0/",
@@ -508,6 +511,86 @@ def _extract_alternative_fields(
     return None
 
 
+def extract_xmp_efficiently(fp) -> T.Optional[str]:
+    """
+    Extract XMP metadata from a JPEG file efficiently by reading only necessary chunks.
+
+    Args:
+        fp: Binary file object positioned at the start of the JPEG stream
+
+    Returns:
+        str: Formatted XML string containing XMP metadata, or None if no XMP data found
+    """
+    # JPEG markers
+    SOI_MARKER = b"\xff\xd8"  # Start of Image
+    APP1_MARKER = b"\xff\xe1"  # Application Segment 1 (where XMP usually lives)
+    XMP_IDENTIFIER = b"http://ns.adobe.com/xap/1.0/\x00"
+    XMP_META_TAG_BEGIN = b"<x:xmpmeta"
+    XMP_META_TAG_END = b"</x:xmpmeta>"
+
+    # Check for JPEG signature (SOI marker)
+    if fp.read(2) != SOI_MARKER:
+        return None
+
+    while True:
+        # Read marker
+        marker_bytes = fp.read(2)
+        if len(marker_bytes) < 2:
+            # End of file
+            break
+
+        # If not APP1, skip this segment
+        if marker_bytes != APP1_MARKER:
+            # Read length field (includes the length bytes themselves)
+            length_bytes = fp.read(2)
+            if len(length_bytes) < 2:
+                break
+
+            length = struct.unpack(">H", length_bytes)[0]
+            # Skip the rest of this segment (-2 because length includes length bytes)
+            fp.seek(length - 2, io.SEEK_CUR)
+            continue
+
+        # It's an APP1 segment - read length
+        length_bytes = fp.read(2)
+        if len(length_bytes) < 2:
+            break
+
+        length = struct.unpack(">H", length_bytes)[0]
+        segment_data_length = length - 2  # Subtract length field size
+
+        # Read enough bytes to check for XMP identifier
+        identifier_check = 
fp.read(len(XMP_IDENTIFIER)) + if len(identifier_check) < len(XMP_IDENTIFIER): + break + + # Check if this APP1 contains XMP data + if identifier_check == XMP_IDENTIFIER: + # We found XMP data - read the rest of the segment + remaining_length = segment_data_length - len(XMP_IDENTIFIER) + if remaining_length > 128 * 1024 * 1024: + raise ValueError("XMP data too large") + xmp_data = fp.read(remaining_length) + + # Process the XMP data + begin_idx = xmp_data.find(XMP_META_TAG_BEGIN) + if begin_idx >= 0: + end_idx = xmp_data.rfind(XMP_META_TAG_END, begin_idx) + if end_idx >= 0: + xmp_data = xmp_data[begin_idx : end_idx + len(XMP_META_TAG_END)] + else: + xmp_data = xmp_data[begin_idx:] + + return xmp_data.decode("utf-8") + else: + # Not XMP data - skip the rest of this APP1 segment + # We already read the identifier_check bytes, so subtract that + fp.seek(segment_data_length - len(identifier_check), io.SEEK_CUR) + + # If we reach here, no XMP data was found + return None + + class ExifReadFromEXIF(ExifReadABC): """ EXIF class for reading exif from an image @@ -520,16 +603,20 @@ def __init__(self, path_or_stream: T.Union[Path, T.BinaryIO]) -> None: if isinstance(path_or_stream, Path): with path_or_stream.open("rb") as fp: try: - self.tags = exifread.process_file(fp, details=True, debug=True) - except Exception: + # Turn off details and debug for performance reasons + self.tags = exifread.process_file(fp, details=False, debug=False) + except Exception as ex: + LOG.warning("Error reading EXIF from %s: %s", path_or_stream, ex) self.tags = {} else: try: + # Turn off details and debug for performance reasons self.tags = exifread.process_file( - path_or_stream, details=True, debug=True + path_or_stream, details=False, debug=False ) - except Exception: + except Exception as ex: + LOG.warning("Error reading EXIF: %s", ex) self.tags = {} def extract_altitude(self) -> T.Optional[float]: @@ -775,27 +862,54 @@ def extract_application_notes(self) -> T.Optional[str]: class 
ExifRead(ExifReadFromEXIF): + """ + Extract properties from EXIF first and then XMP + NOTE: For performance reasons, XMP is only extracted if EXIF does not contain the required fields + """ + def __init__(self, path_or_stream: T.Union[Path, T.BinaryIO]) -> None: super().__init__(path_or_stream) - self._xmp = self._extract_xmp() + self._path_or_stream = path_or_stream + self._xml_extracted: bool = False + self._cached_xml: T.Optional[ExifReadFromXMP] = None + + def _xmp_with_reason(self, reason: str) -> T.Optional[ExifReadFromXMP]: + if not self._xml_extracted: + LOG.debug('Extracting XMP for "%s"', reason) + self._cached_xml = self._extract_xmp() + self._xml_extracted = True + + return self._cached_xml def _extract_xmp(self) -> T.Optional[ExifReadFromXMP]: - application_notes = self.extract_application_notes() - if application_notes is None: - return None + xml_str = self.extract_application_notes() + if xml_str is None: + if isinstance(self._path_or_stream, Path): + with self._path_or_stream.open("rb") as fp: + xml_str = extract_xmp_efficiently(fp) + else: + self._path_or_stream.seek(0, io.SEEK_SET) + xml_str = extract_xmp_efficiently(self._path_or_stream) + + if xml_str is None: + return None + try: - e = et.fromstring(application_notes) - except et.ParseError: + e = et.fromstring(xml_str) + except et.ParseError as ex: + LOG.warning("Error parsing XMP XML: %s: %s", ex, xml_str) return None + return ExifReadFromXMP(et.ElementTree(e)) def extract_altitude(self) -> T.Optional[float]: val = super().extract_altitude() if val is not None: return val - if self._xmp is None: + xmp = self._xmp_with_reason("altitude") + if xmp is None: return None - val = self._xmp.extract_altitude() + val = xmp.extract_altitude() if val is not None: return val return None @@ -804,20 +918,10 @@ def extract_capture_time(self) -> T.Optional[datetime.datetime]: val = super().extract_capture_time() if val is not None: return val - if self._xmp is None: - return None - val = 
self._xmp.extract_capture_time()
-        if val is not None:
-            return val
-        return None
-
-    def extract_direction(self) -> T.Optional[float]:
-        val = super().extract_direction()
-        if val is not None:
-            return val
-        if self._xmp is None:
+        xmp = self._xmp_with_reason("capture_time")
+        if xmp is None:
             return None
-        val = self._xmp.extract_direction()
+        val = xmp.extract_capture_time()
         if val is not None:
             return val
         return None
@@ -826,9 +930,10 @@ def extract_lon_lat(self) -> T.Optional[T.Tuple[float, float]]:
         val = super().extract_lon_lat()
         if val is not None:
             return val
-        if self._xmp is None:
+        xmp = self._xmp_with_reason("lon_lat")
+        if xmp is None:
             return None
-        val = self._xmp.extract_lon_lat()
+        val = xmp.extract_lon_lat()
         if val is not None:
             return val
         return None
@@ -837,9 +942,10 @@ def extract_make(self) -> T.Optional[str]:
         val = super().extract_make()
         if val is not None:
             return val
-        if self._xmp is None:
+        xmp = self._xmp_with_reason("make")
+        if xmp is None:
             return None
-        val = self._xmp.extract_make()
+        val = xmp.extract_make()
         if val is not None:
             return val
         return None
@@ -848,9 +954,10 @@ def extract_model(self) -> T.Optional[str]:
         val = super().extract_model()
         if val is not None:
             return val
-        if self._xmp is None:
+        xmp = self._xmp_with_reason("model")
+        if xmp is None:
             return None
-        val = self._xmp.extract_model()
+        val = xmp.extract_model()
         if val is not None:
             return val
         return None
@@ -859,9 +966,10 @@ def extract_width(self) -> T.Optional[int]:
         val = super().extract_width()
        if val is not None:
             return val
-        if self._xmp is None:
+        xmp = self._xmp_with_reason("width")
+        if xmp is None:
             return None
-        val = self._xmp.extract_width()
+        val = xmp.extract_width()
         if val is not None:
             return val
         return None
@@ -870,9 +978,10 @@ def extract_height(self) -> T.Optional[int]:
         val = super().extract_height()
         if val is not None:
             return val
-        if self._xmp is None:
+        xmp = self._xmp_with_reason("height")
+        if xmp is None:
             return None
-        val = self._xmp.extract_height()
+        
val = xmp.extract_height() if val is not None: return val return None diff --git a/mapillary_tools/geotag/geotag_images_from_exif.py b/mapillary_tools/geotag/geotag_images_from_exif.py index 4e46f2198..4f16fb5f2 100644 --- a/mapillary_tools/geotag/geotag_images_from_exif.py +++ b/mapillary_tools/geotag/geotag_images_from_exif.py @@ -1,4 +1,3 @@ -import io import logging import typing as T from multiprocessing import Pool @@ -6,34 +5,13 @@ from tqdm import tqdm -from .. import exceptions, exif_write, geo, types, utils +from .. import exceptions, geo, types, utils from ..exif_read import ExifRead, ExifReadABC from .geotag_from_generic import GeotagImagesFromGeneric LOG = logging.getLogger(__name__) -def verify_image_exif_write( - metadata: types.ImageMetadata, - image_bytes: T.Optional[bytes] = None, -) -> None: - if image_bytes is None: - edit = exif_write.ExifEdit(metadata.filename) - else: - edit = exif_write.ExifEdit(image_bytes) - - # The cast is to fix the type error in Python3.6: - # Argument 1 to "add_image_description" of "ExifEdit" has incompatible type "ImageDescription"; expected "Dict[str, Any]" - edit.add_image_description( - T.cast(T.Dict, types.desc_file_to_exif(types.as_desc(metadata))) - ) - - # Possible errors thrown here: - # - struct.error: 'H' format requires 0 <= number <= 65535 - # - piexif.InvalidImageDataError - edit.dump_image_bytes() - - class GeotagImagesFromEXIF(GeotagImagesFromGeneric): def __init__( self, image_paths: T.Sequence[Path], num_processes: T.Optional[int] = None @@ -84,30 +62,16 @@ def geotag_image( image_path: Path, skip_lonlat_error: bool = False ) -> types.ImageMetadataOrError: try: - # load the image bytes into memory to avoid reading it multiple times with image_path.open("rb") as fp: - image_bytesio = io.BytesIO(fp.read()) - - image_bytesio.seek(0, io.SEEK_SET) - exif = ExifRead(image_bytesio) - - image_metadata = GeotagImagesFromEXIF.build_image_metadata( - image_path, exif, skip_lonlat_error=skip_lonlat_error - ) - - 
image_bytesio.seek(0, io.SEEK_SET) - verify_image_exif_write( - image_metadata, - image_bytes=image_bytesio.read(), - ) + exif = ExifRead(fp) + image_metadata = GeotagImagesFromEXIF.build_image_metadata( + image_path, exif, skip_lonlat_error=skip_lonlat_error + ) except Exception as ex: return types.describe_error_metadata( ex, image_path, filetype=types.FileType.IMAGE ) - image_bytesio.seek(0, io.SEEK_SET) - image_metadata.update_md5sum(image_bytesio) - return image_metadata def to_description(self) -> T.List[types.ImageMetadataOrError]: diff --git a/mapillary_tools/geotag/geotag_images_from_exiftool.py b/mapillary_tools/geotag/geotag_images_from_exiftool.py index b343f7dca..79d0a00dc 100644 --- a/mapillary_tools/geotag/geotag_images_from_exiftool.py +++ b/mapillary_tools/geotag/geotag_images_from_exiftool.py @@ -9,7 +9,7 @@ from .. import exceptions, exiftool_read, types from .geotag_from_generic import GeotagImagesFromGeneric -from .geotag_images_from_exif import GeotagImagesFromEXIF, verify_image_exif_write +from .geotag_images_from_exif import GeotagImagesFromEXIF LOG = logging.getLogger(__name__) @@ -40,10 +40,6 @@ def geotag_image(element: ET.Element) -> types.ImageMetadataOrError: with image_path.open("rb") as fp: image_bytesio = io.BytesIO(fp.read()) image_bytesio.seek(0, io.SEEK_SET) - verify_image_exif_write( - image_metadata, - image_bytes=image_bytesio.read(), - ) except Exception as ex: return types.describe_error_metadata( ex, image_path, filetype=types.FileType.IMAGE diff --git a/mapillary_tools/process_geotag_properties.py b/mapillary_tools/process_geotag_properties.py index 054a8cbd3..698e8889e 100644 --- a/mapillary_tools/process_geotag_properties.py +++ b/mapillary_tools/process_geotag_properties.py @@ -9,7 +9,7 @@ from tqdm import tqdm -from . import constants, exceptions, exif_write, history, types, utils +from . 
import constants, exceptions, exif_write, types, utils from .geotag import ( geotag_from_generic, geotag_images_from_exif, @@ -461,62 +461,6 @@ def split_if( return yes, no -def _check_upload_status( - metadatas: T.Sequence[types.MetadataOrError], -) -> T.List[types.MetadataOrError]: - groups = types.group_and_sort_images( - [ - metadata - for metadata in metadatas - if isinstance(metadata, types.ImageMetadata) - ] - ) - uploaded_sequence_uuids = set() - for sequence_uuid, group in groups.items(): - for m in group: - m.update_md5sum() - sequence_md5sum = types.sequence_md5sum(group) - if history.is_uploaded(sequence_md5sum): - uploaded_sequence_uuids.add(sequence_uuid) - - output: T.List[types.MetadataOrError] = [] - for metadata in metadatas: - if isinstance(metadata, types.ImageMetadata): - if metadata.MAPSequenceUUID in uploaded_sequence_uuids: - output.append( - types.describe_error_metadata( - exceptions.MapillaryUploadedAlreadyError( - "The image was already uploaded", - types.as_desc(metadata), - ), - filename=metadata.filename, - filetype=types.FileType.IMAGE, - ) - ) - else: - output.append(metadata) - elif isinstance(metadata, types.VideoMetadata): - metadata.update_md5sum() - assert isinstance(metadata.md5sum, str) - if history.is_uploaded(metadata.md5sum): - output.append( - types.describe_error_metadata( - exceptions.MapillaryUploadedAlreadyError( - "The video was already uploaded", - types.as_desc(metadata), - ), - filename=metadata.filename, - filetype=metadata.filetype, - ) - ) - else: - output.append(metadata) - else: - output.append(metadata) - assert len(output) == len(metadatas), "length mismatch" - return output - - def _validate_metadatas( metadatas: T.Sequence[types.MetadataOrError], num_processes: T.Optional[int] ) -> T.List[types.MetadataOrError]: @@ -593,10 +537,7 @@ def process_finalize( ) LOG.debug("Validating %d metadatas", len(metadatas)) - metadatas = _validate_metadatas(metadatas, num_processes) - - LOG.info("Checking upload status 
for %d metadatas", len(metadatas)) - metadatas = _check_upload_status(metadatas) + metadatas = _validate_metadatas(metadatas, num_processes=num_processes) _overwrite_exif_tags( # search image metadatas again because some of them might have been failed diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py index 5d19b10dd..cc30f54d3 100644 --- a/tests/integration/fixtures.py +++ b/tests/integration/fixtures.py @@ -224,7 +224,9 @@ def verify_descs(expected: T.List[T.Dict], actual: T.Union[Path, T.List[T.Dict]] actual_desc = actual_map.get(filename) assert actual_desc is not None, expected_desc if "error" in expected_desc: - assert expected_desc["error"]["type"] == actual_desc["error"]["type"] + assert expected_desc["error"]["type"] == actual_desc.get("error", {}).get( + "type" + ), f"{expected_desc=} != {actual_desc=}" if "message" in expected_desc["error"]: assert ( expected_desc["error"]["message"] == actual_desc["error"]["message"] diff --git a/tests/integration/test_history.py b/tests/integration/test_history.py index 656bc626f..0dbf0e4f8 100644 --- a/tests/integration/test_history.py +++ b/tests/integration/test_history.py @@ -1,5 +1,4 @@ import subprocess -from pathlib import Path import py.path import pytest @@ -10,7 +9,6 @@ setup_data, setup_upload, USERNAME, - verify_descs, ) UPLOAD_FLAGS = f"--dry_run --user_name={USERNAME}" @@ -32,38 +30,6 @@ def test_upload_images( for upload in setup_upload.listdir(): upload.remove() - x = subprocess.run( - f"{EXECUTABLE} process --file_types=image {str(setup_data)}", - shell=True, - ) - assert x.returncode == 0, x.stderr - verify_descs( - [ - { - "error": { - "message": "The image was already uploaded", - "type": "MapillaryUploadedAlreadyError", - }, - "filename": str(Path(setup_data.join("images").join("DSC00001.JPG"))), - }, - { - "error": { - "message": "The image was already uploaded", - "type": "MapillaryUploadedAlreadyError", - }, - "filename": 
str(Path(setup_data.join("images").join("DSC00497.JPG"))), - }, - { - "error": { - "message": "The image was already uploaded", - "type": "MapillaryUploadedAlreadyError", - }, - "filename": str(Path(setup_data.join("images").join("V0370574.JPG"))), - }, - ], - Path(setup_data, "mapillary_image_description.json"), - ) - x = subprocess.run( f"{EXECUTABLE} process_and_upload --file_types=image {UPLOAD_FLAGS} {str(setup_data)}", shell=True, diff --git a/tests/unit/test_sequence_processing.py b/tests/unit/test_sequence_processing.py index 475002366..5d076cb92 100644 --- a/tests/unit/test_sequence_processing.py +++ b/tests/unit/test_sequence_processing.py @@ -456,7 +456,7 @@ def test_process_finalize(setup_data): "MAPLongitude": 1, "MAPCaptureTime": "1970_01_01_00_00_02_000", "MAPCompassHeading": {"TrueHeading": 17.0, "MagneticHeading": 17.0}, - "md5sum": "346c064df2c194e20ea98708fd61ac10", + "md5sum": None, }, { "error": {