diff --git a/CHANGELOG.md b/CHANGELOG.md index 95485df..1d7be79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ See [GitHub releases](https://github.com/pyOpenSci/pyosMeta/releases) page for a ## [Unreleased] +* RSS feed parser that will generate Markdown stub files (@banesullivan, #301) +* Three new dependencies: + * `feedparser`: a utility library for fetching and parsing RSS feeds. This saves us from having to write quite a lot of fetching/parsing logic. + * `unidecode`: transliterates Unicode text to ASCII; used by the new `slugify` utility function to turn long title strings into slugs. + * `click`: for adding arguments to command line scripts + ## [v1.7.3] - 2025-08-07 * Fix: gracefully fail when collecting repository metrics outside of GitHub (@banesullivan, #300) diff --git a/pyproject.toml b/pyproject.toml index 81f7c02..7326015 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,12 +24,15 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ + "click", + "feedparser", "pydantic>=2.0", "python-doi", "python-dotenv", "requests", "ruamel-yaml>=0.17.21", "tqdm", + "unidecode" ] # This is metadata that pip reads to understand what Python versions your package supports requires-python = ">=3.10" @@ -43,6 +46,7 @@ dev = [ "pre-commit", "pytest", "pytest-cov", + "pytest-localserver", "pytest-mock", ] @@ -58,6 +62,7 @@ parse-history = "pyosmeta.cli.parse_history:main" update-contributors = "pyosmeta.cli.update_contributors:main" update-reviews = "pyosmeta.cli.process_reviews:main" update-review-teams = "pyosmeta.cli.update_review_teams:main" +fetch-rss-feed = "pyosmeta.cli.fetch_rss_feed:main" [tool.coverage.run] branch = true diff --git a/src/pyosmeta/cli/fetch_rss_feed.py b/src/pyosmeta/cli/fetch_rss_feed.py new file mode 100644 index 0000000..35dba38 --- /dev/null +++ b/src/pyosmeta/cli/fetch_rss_feed.py @@ -0,0 +1,15 @@ +import click + +from pyosmeta.parse_rss import create_rss_feed_stubs + + +@click.command() +@click.argument("url") +@click.argument("output_dir") 
+def main(url: str, output_dir: str): + """Create markdown stubs from an RSS feed URL into a directory.""" + create_rss_feed_stubs(url, output_dir) + + +if __name__ == "__main__": + main() diff --git a/src/pyosmeta/parse_rss.py b/src/pyosmeta/parse_rss.py new file mode 100644 index 0000000..99163ac --- /dev/null +++ b/src/pyosmeta/parse_rss.py @@ -0,0 +1,63 @@ +from pathlib import Path + +import feedparser + +from .utils_clean import slugify + + +def parse_rss_feed(url: str) -> list[dict]: + """Fetch and parse an RSS feed from a URL.""" + parsed_feed = feedparser.parse(url) + return [ + {key: entry.get(key) for key in entry.keys()} + for entry in parsed_feed.entries + ] + + +def make_md_stub(index: int, title: str, summary: str, link: str) -> str: + """Create a Markdown stub for an entry.""" + return f''' +--- +title: "{index}. {title}" +excerpt: " + {summary}" +link: {link} +btn_label: View Tutorial +btn_class: btn--success btn--large +--- +''' + + +def fetch_rss_feed_as_stubs(url: str) -> dict[str, str]: + """Fetch an RSS feed and return a dictionary of Markdown stubs. + + The keys of the dictionary are filenames, and the values are the Markdown content. + """ + items = parse_rss_feed(url) + + stubs = {} + for i, item in enumerate(items): + title = item.get("title", None) + if not title: + # WARN + continue + filename = f"{i:02d}-{slugify(title)}.md" + content = make_md_stub( + index=i, + title=title, + summary=item.get("summary", ""), + link=item.get("link", "#"), + ) + stubs[filename] = content + return stubs + + +def create_rss_feed_stubs(url: str, output_dir: str) -> None: + """Create markdown stubs from an RSS feed URL into a directory.""" + stubs = fetch_rss_feed_as_stubs(url) + for filename, content in stubs.items(): + # TODO: should we wipe existing files? 
+ path = Path(output_dir) / filename + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + f.write(content) diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py index 907d97f..2d99769 100644 --- a/src/pyosmeta/utils_clean.py +++ b/src/pyosmeta/utils_clean.py @@ -9,6 +9,7 @@ import doi import requests +import unidecode from .logging import logger @@ -213,3 +214,9 @@ def clean_archive(archive): return None else: raise ValueError(f"Invalid archive URL: {archive}") + + +def slugify(text: str) -> str: + """Convert a long title/text into a slug suitable for filenames/URLs.""" + text = unidecode.unidecode(text).lower() + return re.sub(r"[\W_]+", "-", text) diff --git a/tests/conftest.py b/tests/conftest.py index e484968..d53dd26 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -175,3 +175,11 @@ def _data_file( return path return _data_file + + +@pytest.fixture +def rss_feed_url(httpserver): + """Serve a local RSS feed for testing.""" + path = DATA_DIR / "tutorials.rss" + httpserver.serve_content(path.read_text()) + return httpserver.url diff --git a/tests/data/tutorials.rss b/tests/data/tutorials.rss new file mode 100644 index 0000000..e1e37a2 --- /dev/null +++ b/tests/data/tutorials.rss @@ -0,0 +1,108 @@ + + + + + pyOpenSci Tutorials + https://www.pyopensci.org/python-package-guide/tutorials/intro.html + + A tutorial feed that lists metadata for the pyOpenSci Python packaging tutorials so we can automatically list them on our website. + en + Mon, 11 Aug 2025 21:09:23 GMT + + Add a License and Code of Conduct to your python package + https://www.pyopensci.org/python-package-guide/tutorials/add-license-coc.html + Learn how to add a LICENSE and CODE_OF_CONDUCT file to your Python package. This lesson covers choosing a permissive license, placing key files for visibility on GitHub and PyPI, and adopting the Contributor Covenant to support an inclusive community. 
+ pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/add-license-coc.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Add a README file to your Python package + https://www.pyopensci.org/python-package-guide/tutorials/add-readme.html + Learn how to create a clear, effective README file for your Python package. This lesson covers what to include, why each section matters, and how a well-structured README improves usability and discoverability on GitHub and PyPI. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/add-readme.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Command Line Reference Guide + https://www.pyopensci.org/python-package-guide/tutorials/command-line-reference.html + Learn how to add a command-line interface (CLI) to your Python package using the argparse library. This lesson walks you through creating a CLI entry point so users can run your package directly from the terminal. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/command-line-reference.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Create a Python package from scratch, a beginner-friendly tutorial + https://www.pyopensci.org/python-package-guide/tutorials/create-python-package.html + Learn how to create a Python package and make your code installable using Hatch. This tutorial walks you through structuring your code and configuring a pyproject.toml so others can easily install and use your package. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/create-python-package.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Use Hatch environments with your Python package: a beginner-friendly tutorial + https://www.pyopensci.org/python-package-guide/tutorials/develop-python-package-hatch.html + The pyOpenSci pure Python package template uses Hatch to manage environments and run tests, docs, and other maintenance steps. Learn how to use Hatch environments to manage your Python package. 
+ pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/develop-python-package-hatch.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Get to Know Hatch + https://www.pyopensci.org/python-package-guide/tutorials/get-to-know-hatch.html + Get started with Hatch, a modern Python packaging tool. This lesson introduces Hatch’s features and shows how it simplifies environment management, project scaffolding, and building your package. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/get-to-know-hatch.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Python packaging 101 + https://www.pyopensci.org/python-package-guide/tutorials/intro.html + This page outlines the key steps to create, document, and share a high-quality scientific Python package. Here you will also get an overview of the pyOpenSci packaging guide and what you’ll learn. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/intro.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Publish your Python package that is on PyPI to conda-forge + https://www.pyopensci.org/python-package-guide/tutorials/publish-conda-forge.html + Learn how to publish your Python package on conda-forge to make it easily installable with conda. This lesson covers the submission process, metadata requirements, and maintaining your feedstock. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/publish-conda-forge.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Publish your Python package to PyPI + https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html + Learn how to publish your Python package on PyPI so others can install it using pip. This lesson covers building your package, creating a PyPI account, and uploading your distribution files. 
+ pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Make your Python package PyPI ready - pyproject.toml + https://www.pyopensci.org/python-package-guide/tutorials/pyproject-toml.html + The pyproject.toml file is the central configuration file for building and packaging Python projects. This lesson explains key sections like name, version, dependencies, and how they support packaging and distribution. You’ll learn how to set up this file to ensure your package is ready for publishing. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/pyproject-toml.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Using Hatch to Migrate setup.py to a pyproject.toml + https://www.pyopensci.org/python-package-guide/tutorials/setup-py-to-pyproject-toml.html + If you’re creating a pure Python project, pyproject.toml is preferred over setup.py for packaging and configuration. Learn how to migrate from the older setup.py format to the modern pyproject.toml file. This lesson walks you through updating your package metadata and build settings to align with current Python packaging standards. + pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/setup-py-to-pyproject-toml.html + Mon, 11 Aug 2025 21:07:32 GMT + + + Setup Trusted Publishing for secure and automated publishing via GitHub Actions + https://www.pyopensci.org/python-package-guide/tutorials/trusted-publishing.html + Learn how to publish your Python package automatically via GitHub Actions. This lesson also covers how to do publishing in a secure way by using Trusted Publishing. 
+ pyOpenSci + https://www.pyopensci.org/python-package-guide/tutorials/trusted-publishing.html + Mon, 11 Aug 2025 21:07:32 GMT + + + diff --git a/tests/unit/test_parse_rss.py b/tests/unit/test_parse_rss.py new file mode 100644 index 0000000..9333e97 --- /dev/null +++ b/tests/unit/test_parse_rss.py @@ -0,0 +1,24 @@ +from pyosmeta.parse_rss import fetch_rss_feed_as_stubs, parse_rss_feed + + +def test_rss_feed_parse(rss_feed_url): + feed = parse_rss_feed(rss_feed_url) + assert feed is not None + assert len(feed) > 0 + for entry in feed: + assert isinstance(entry, dict) + assert "title" in entry + assert "link" in entry + assert "summary" in entry + + +def test_fetch_rss_feed_as_stubs(rss_feed_url): + stubs = fetch_rss_feed_as_stubs(rss_feed_url) + assert isinstance(stubs, dict) + assert len(stubs) > 0 + for filename, content in stubs.items(): + assert filename.endswith(".md") + assert isinstance(content, str) + assert "title:" in content + assert "excerpt:" in content + assert "link:" in content