Skip to content

Commit c7c498f

Browse files
authored
Merge pull request #446 from softwarepub/feature/424-readme-license-file-exists
Add implementation of `file_exists` plugin
2 parents 2a3f347 + 05f6146 commit c7c498f

File tree

5 files changed

+339
-2
lines changed

5 files changed

+339
-2
lines changed

docs/source/plugins/plugins.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515
"harvested_files": ["codemeta.json"],
1616
"builtin": true
1717
},
18+
{
19+
"name": "'file_exists' Harvester",
20+
"description": "Harvest plugin that figures out whether certain frequently used files (README, LICENSE, ...) exist. Custom search patterns for other types of files can be configured.",
21+
"author": "Hermes team",
22+
"steps": ["harvest"],
23+
"harvested_files": [
24+
"readme",
25+
"license"
26+
],
27+
"builtin": true
28+
},
1829
{
1930
"name": "hermes-plugin-git",
2031
"description": "Harvest plugin for Git repository metadata.",

hermes.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
# SPDX-License-Identifier: CC0-1.0
44

55
[harvest]
6-
sources = [ "cff", "toml" ] # ordered priority (first one is most important)
6+
sources = [ "cff", "toml", "file_exists" ] # ordered priority (first one is most important)
7+
8+
[harvest.file_exists.search_patterns]
9+
community = ["contributing.md", "governance.md"]
710

811
[deposit]
912
target = "invenio_rdm"
@@ -18,4 +21,3 @@ record_id = 13221384
1821
depositions = "api/deposit/depositions"
1922
licenses = "api/vocabularies/licenses"
2023
communities = "api/communities"
21-

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ hermes-marketplace = "hermes.commands.marketplace:main"
5757
[project.entry-points."hermes.harvest"]
5858
cff = "hermes.commands.harvest.cff:CffHarvestPlugin"
5959
codemeta = "hermes.commands.harvest.codemeta:CodeMetaHarvestPlugin"
60+
file_exists = "hermes.commands.harvest.file_exists:FileExistsHarvestPlugin"
6061

6162
[project.entry-points."hermes.deposit"]
6263
file = "hermes.commands.deposit.file:FileDepositPlugin"
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
# SPDX-FileCopyrightText: 2025 Helmholtz-Zentrum Dresden-Rossendorf
2+
# SPDX-License-Identifier: Apache-2.0
3+
# SPDX-FileContributor: David Pape
4+
5+
"""Module for the ``FileExistsHarvestPlugin`` and its associated models and helpers."""
6+
7+
from collections import defaultdict
8+
from dataclasses import dataclass
9+
from functools import cache
10+
from pathlib import Path
11+
from typing import Dict, Iterable, List, Optional, Set
12+
from typing_extensions import Self
13+
import subprocess
14+
15+
from pydantic import BaseModel
16+
17+
from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin
18+
from hermes.utils import guess_file_type
19+
20+
21+
@dataclass(kw_only=True)
22+
class URL:
23+
"""Basic model of a ``schema:URL``.
24+
25+
See also: https://schema.org/URL
26+
"""
27+
28+
url: str
29+
30+
@classmethod
31+
def from_path(cls, path: Path) -> Self:
32+
return cls(url=path.as_uri())
33+
34+
def as_codemeta(self) -> dict:
35+
return {
36+
"@type": "schema:URL",
37+
"@value": self.url,
38+
}
39+
40+
41+
# TODO: Support common subtypes of ``MediaObject`` such as ``TextObject`` and
42+
# ``ImageObject``? This would require either mapping mime types to text/image/binary/...
43+
# which probably has many special cases (e.g. ``application/toml`` → text,
44+
# ``image/svg+xml`` → text, ...), or figuring this out using the file itself, e.g.
45+
# using libmagic.
46+
@dataclass(kw_only=True)
class MediaObject:
    """Minimal representation of a ``schema:MediaObject``.

    Reference: https://schema.org/MediaObject
    """

    # File size in bytes, kept as a string; ``None`` if the file was not found.
    content_size: Optional[str]
    # Media type as reported by ``guess_file_type``; may be ``None``.
    encoding_format: Optional[str]
    # Location of the file.
    url: URL

    @classmethod
    def from_path(cls, path: Path) -> Self:
        """Describe the file at ``path``.

        A missing file is tolerated: the size is then recorded as ``None`` while
        the media type and URL are still derived from the path.
        """
        try:
            stat_result = path.stat()
        except FileNotFoundError:
            size_in_bytes = None
        else:
            # The size is deliberately stored as a string, not an integer.
            size_in_bytes = str(stat_result.st_size)

        media_type, _ = guess_file_type(path)
        location = URL.from_path(path)
        return cls(
            content_size=size_in_bytes,
            encoding_format=media_type,
            url=location,
        )

    def as_codemeta(self) -> dict:
        """Return the object as a typed JSON-LD style dictionary."""
        codemeta = {"@type": "schema:MediaObject"}
        codemeta["schema:contentSize"] = self.content_size
        codemeta["schema:encodingFormat"] = self.encoding_format
        codemeta["schema:url"] = self.url.as_codemeta()
        return codemeta
75+
76+
77+
@dataclass(kw_only=True)
class CreativeWork:
    """Basic model of a ``schema:CreativeWork``.

    See also: https://schema.org/CreativeWork
    """

    # Display name of the work (the file name without its final suffix when
    # created via ``from_path``).
    name: str
    # Description of the underlying file.
    associated_media: MediaObject
    # Tags attached to the work.
    keywords: Set[str]

    @classmethod
    def from_path(cls, path: Path, keywords: Iterable[str]) -> Self:
        """Create a work for the file at ``path`` tagged with ``keywords``.

        :param path: Path of the file; ``path.stem`` becomes the name.
        :param keywords: Tags for the file; duplicates are collapsed.
        """
        media_object = MediaObject.from_path(path)
        return cls(
            name=path.stem,
            associated_media=media_object,
            keywords=set(keywords),
        )

    def as_codemeta(self) -> dict:
        """Return the work as a typed JSON-LD style dictionary.

        Keywords are emitted in sorted order: iterating a set of strings directly
        yields an order that can change between interpreter runs, which would make
        the harvested output non-reproducible.
        """
        return {
            "@type": "schema:CreativeWork",
            "schema:name": self.name,
            "schema:associatedMedia": self.associated_media.as_codemeta(),
            "schema:keywords": sorted(self.keywords),
        }
100+
101+
102+
class FileExistsHarvestSettings(BaseModel):
    """Configuration options of the ``file_exists`` harvester."""

    # Try ``git ls-files`` first when collecting files; the working directory is
    # searched recursively if this is disabled or if git cannot provide a list.
    enable_git_ls_files: bool = True

    # Keep files that matched no search pattern instead of dropping them.
    keep_untagged_files: bool = False

    # Additional search patterns, keyed by group name (the group name is used as
    # the keyword when tagging files).
    search_patterns: Dict[str, List[str]] = {}
108+
109+
110+
class FileExistsHarvestPlugin(HermesHarvestPlugin):
    """Harvest plugin that finds and tags files based on patterns.

    Files are searched using ``git ls-files`` or a recursive traversal of the working
    directory. If available, ``git ls-files`` is used. This can be disabled via the
    options.

    The found files are then tagged based on patterns such as ``readme.md``
    or ``licenses/*.txt``. Matching of the file paths is implemented using the ``match``
    function of Python's ``Path`` objects, which means that matching is performed from
    the end of the path. Search patterns are case-insensitive and use ``/`` as the path
    separator.

    Files are tagged using the name of the file name pattern's "group" as the keyword.
    If a file matches multiple patterns, all appropriate keywords are added. Depending
    on configuration of ``keep_untagged_files``, files without any tags are then removed
    from the file list (this is the default).

    Search patterns from the configuration are merged into the built-in ones per
    group: configuring a group that already exists (e.g. ``readme``) replaces the
    built-in pattern list of that group.

    Files that were tagged with ``readme`` are added to the data model as a
    ``schema:URL`` using the ``codemeta:readme`` property. Files that were tagged
    ``license`` are added to the data model as a ``schema:URL`` using the
    ``schema:license`` property. All files are added to the data model as a
    ``schema:CreativeWork`` using the ``schema:hasPart`` property. All file URLs are
    given using the ``file:`` protocol and the absolute path of the file at the time of
    harvesting.
    """

    settings_class = FileExistsHarvestSettings

    # key: group name (used as keywords when tagging), value: list of patterns
    base_search_patterns = {
        "readme": [
            "readme",
            "readme.md",
            "readme.markdown",
            "readme.rst",
            "readme.txt",
        ],
        "license": [
            "license",
            "license.txt",
            "license.md",
            "licenses/*.txt",
        ],
    }

    def __init__(self):
        self.working_directory: Path = Path.cwd()
        self.settings: FileExistsHarvestSettings = FileExistsHarvestSettings()

        # mapping from tag name to list of file name patterns
        # This must be a per-instance copy: ``__call__`` merges the user's
        # configuration into it, and doing that on the class attribute itself would
        # leak one instance's configuration into every other instance.
        self.search_patterns: Dict[str, List[str]] = {
            group: list(patterns)
            for group, patterns in self.base_search_patterns.items()
        }
        # mapping from file name pattern to set of tags
        self.search_pattern_keywords: Dict[str, Set[str]] = defaultdict(set)
        # flat list of file name patterns
        self.search_pattern_list: List[str] = []

    def __call__(self, command: HermesHarvestCommand):
        """Run the harvester.

        :param command: The harvest command; its ``args.path`` is used as the
            working directory and its ``settings.file_exists`` as the settings.
        :return: A tuple of the harvested data (``schema:hasPart``,
            ``schema:license`` and ``codemeta:readme`` entries) and a dictionary
            containing the working directory under ``workingDirectory``.
        """
        self.working_directory = command.args.path.resolve()
        self.settings = command.settings.file_exists

        # update search patterns from config
        self.search_patterns.update(self.settings.search_patterns)

        # create inverse lookup table; start from scratch so that patterns of a
        # group that was replaced by the config do not linger from an earlier call
        self.search_pattern_keywords = defaultdict(set)
        for key, patterns in self.search_patterns.items():
            for pattern in patterns:
                self.search_pattern_keywords[pattern].add(key)

        # create flat list for easy iteration; taking the keys of the lookup table
        # yields every pattern exactly once, in order of first appearance
        self.search_pattern_list = list(self.search_pattern_keywords)

        files_tags = self._filter_files(self._tag_files(self._find_files()))
        creative_works = [
            CreativeWork.from_path(file, tags) for file, tags in files_tags.items()
        ]

        data = {
            "schema:hasPart": [work.as_codemeta() for work in creative_works],
            "schema:license": [
                work.associated_media.url.as_codemeta()
                for work in creative_works
                if "license" in work.keywords
            ],
            "codemeta:readme": [
                work.associated_media.url.as_codemeta()
                for work in creative_works
                if "readme" in work.keywords
            ],
        }

        return data, {"workingDirectory": str(self.working_directory)}

    def _find_files(self) -> List[Path]:
        """Find files.

        If the setting ``enable_git_ls_files`` is ``True``, ``git ls-files`` is used to
        find matching files. If it is set to ``False`` or getting the list from git
        fails, the working directory is searched recursively.
        """
        files = None
        if self.settings.enable_git_ls_files:
            files = _git_ls_files(self.working_directory)
        if files is None:
            files = _ls_files(self.working_directory)
        return files

    def _tag_files(self, paths: Iterable[Path]) -> Dict[Path, Set[str]]:
        """Tag file paths based on patterns.

        The files are tagged using the "group" names of the search pattern as the
        keywords. Every given path appears in the result; paths that match no
        pattern are mapped to an empty set.
        """
        paths_tags: Dict[Path, Set[str]] = {}
        for path in paths:
            tags: Set[str] = set()
            for pattern in self.search_pattern_list:
                if _path_matches_pattern(path, pattern):
                    tags.update(self.search_pattern_keywords[pattern])
            paths_tags[path] = tags
        return paths_tags

    def _filter_files(self, files_tags: Dict[Path, Set[str]]) -> Dict[Path, Set[str]]:
        """Filter out untagged files if required.

        If the setting ``keep_untagged_files`` is set to ``True``, the filter is not
        applied.
        """
        if self.settings.keep_untagged_files:
            return files_tags
        return {path: tags for path, tags in files_tags.items() if tags}
242+
243+
244+
def _path_matches_pattern(path: Path, pattern: str) -> bool:
245+
"""Case-insensitive path matching.
246+
247+
Python 3.12 introduces the ``case_sensitive`` kwarg to the ``match`` function. For
248+
older Python versions, we have to implement this behaviour ourselves.
249+
"""
250+
return Path(str(path).casefold()).match(pattern.casefold())
251+
252+
253+
def _ls_files(working_directory: Path) -> List[Path]:
254+
"""Get a list of all files by recursively searching the ``working_directory``.
255+
256+
Only regular files (i.e. files which are not directories, pipes, etc.) are returned.
257+
"""
258+
return [file for file in working_directory.rglob("*") if file.is_file()]
259+
260+
261+
@cache
262+
def _git_ls_files(working_directory: Path) -> Optional[List[Path]]:
263+
"""Get a list of all files by calling ``git ls-file`` in ``working_directory``.
264+
265+
``git ls-file`` is called with the ``--cached`` flag which lists all files tracked
266+
by git. The returned file paths are converted to a list of ``Path`` objects. Files
267+
that are tracked by git but don't exist on disk are not returned. If the git command
268+
fails or git is not found, ``None`` is returned.
269+
270+
The result of this function is cached. Git is only executed once per given
271+
``working_directory``.
272+
"""
273+
try:
274+
result = subprocess.run(
275+
["git", "ls-files", "--cached"],
276+
capture_output=True,
277+
cwd=working_directory,
278+
text=True,
279+
)
280+
except FileNotFoundError:
281+
return None
282+
if result.returncode != 0:
283+
return None
284+
filenames = result.stdout.splitlines()
285+
files = [Path(filename).resolve() for filename in filenames]
286+
return [file for file in files if file.exists()]

src/hermes/utils.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
# SPDX-FileContributor: Stephan Druskat <[email protected]>
77

88
from importlib.metadata import metadata
9+
from mimetypes import guess_type
10+
from pathlib import Path
911

1012

1113
def retrieve_project_urls(metadata_urls: list[str]) -> dict[str, str]:
@@ -43,3 +45,38 @@ def retrieve_project_urls(metadata_urls: list[str]) -> dict[str, str]:
4345

4446
# User agent
4547
hermes_user_agent = f"{hermes_name}/{hermes_version} ({hermes_homepage})"
48+
49+
50+
def guess_file_type(path: Path):
51+
"""File type detection for non-standardised formats.
52+
53+
Custom detection for file types not yet supported by Python's ``guess_type``
54+
function.
55+
"""
56+
# YAML was only added to ``guess_type`` in Python 3.14 due to the MIME type only
57+
# having been decided in 2024.
58+
# See: https://www.rfc-editor.org/rfc/rfc9512.html
59+
if path.suffix in [".yml", ".yaml"]:
60+
return ("application/yaml", None)
61+
62+
# TOML is not yet part of ``guess_type`` due to the MIME type only having been
63+
# accepted in October of 2024.
64+
# See: https://www.iana.org/assignments/media-types/application/toml
65+
if path.suffix == ".toml":
66+
return ("application/toml", None)
67+
68+
# cff is yaml.
69+
# See: https://github.com/citation-file-format/citation-file-format/issues/391
70+
if path.name == "CITATION.cff":
71+
return ("application/yaml", None)
72+
73+
# .license files are likely license annotations according to REUSE specification.
74+
# See: https://reuse.software/spec/
75+
if path.suffix == ".license":
76+
return ("text/plain", None)
77+
78+
if path.name == "poetry.lock":
79+
return ("text/plain", None)
80+
81+
# use non-strict mode to cover more file types
82+
return guess_type(path, strict=False)

0 commit comments

Comments
 (0)