Skip to content

Utilities to parse RSS feeds and make markdown stubs #301

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ See [GitHub releases](https://github.com/pyOpenSci/pyosMeta/releases) page for a

## [Unreleased]

* RSS feed parser that will generate Markdown stub files (@banesullivan, #301)
* Three new dependencies:
* `feedparser`: a utility library for fetching and parsing RSS feeds. This saves us from having to write quite a lot of fetching/parsing logic.
* `unidecode`: transliterates Unicode text to ASCII; it backs the new `slugify` utility function that turns long title strings into filename-safe slugs.
* `click`: for adding arguments to command line scripts

## [v1.7.3] - 2025-08-07

* Fix: gracefully fail when collecting repository metrics outside of GitHub (@banesullivan, #300)
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@ classifiers = [
"Programming Language :: Python :: 3.11",
]
dependencies = [
"click",
"feedparser",
"pydantic>=2.0",
"python-doi",
"python-dotenv",
"requests",
"ruamel-yaml>=0.17.21",
"tqdm",
"unidecode"
]
# This is metadata that pip reads to understand what Python versions your package supports
requires-python = ">=3.10"
Expand All @@ -43,6 +46,7 @@ dev = [
"pre-commit",
"pytest",
"pytest-cov",
"pytest-localserver",
"pytest-mock",
]

Expand All @@ -58,6 +62,7 @@ parse-history = "pyosmeta.cli.parse_history:main"
update-contributors = "pyosmeta.cli.update_contributors:main"
update-reviews = "pyosmeta.cli.process_reviews:main"
update-review-teams = "pyosmeta.cli.update_review_teams:main"
fetch-rss-feed = "pyosmeta.cli.fetch_rss_feed:main"

[tool.coverage.run]
branch = true
Expand Down
15 changes: 15 additions & 0 deletions src/pyosmeta/cli/fetch_rss_feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import click

from pyosmeta.parse_rss import create_rss_feed_stubs


# Console entry point; pyproject.toml registers it as the ``fetch-rss-feed``
# script. ``url`` and ``output_dir`` are positional command-line arguments.
# NOTE: click shows the docstring below as the command's ``--help`` text, so
# it is user-facing output rather than internal documentation.
@click.command()
@click.argument("url")
@click.argument("output_dir")
def main(url: str, output_dir: str) -> None:
    """Create markdown stubs from an RSS feed URL into a directory."""
    # Fetching, rendering and writing are all delegated to the library
    # function; this command only forwards its two arguments.
    create_rss_feed_stubs(url, output_dir)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions src/pyosmeta/parse_rss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import warnings
from pathlib import Path

import feedparser

from .utils_clean import slugify


def parse_rss_feed(url: str) -> list[dict]:
    """Parse the RSS feed at ``url`` and return its entries as plain dicts.

    Parameters
    ----------
    url : str
        Location of the feed; passed unchanged to ``feedparser.parse``.

    Returns
    -------
    list[dict]
        One dictionary per feed entry, containing every key the parsed
        entry reports together with the value looked up for that key.
    """
    feed = feedparser.parse(url)
    records = []
    for entry in feed.entries:
        # Copy the entry into a builtin dict, key by key.
        record = {}
        for field in entry.keys():
            record[field] = entry.get(field)
        records.append(record)
    return records


def _yaml_escape(text: str) -> str:
    """Escape ``text`` for embedding inside a double-quoted YAML scalar."""
    # Backslashes must be doubled first; otherwise the backslashes added in
    # front of the quotes would themselves be doubled.
    return str(text).replace("\\", "\\\\").replace('"', '\\"')


def make_md_stub(index: int, title: str, summary: str, link: str) -> str:
    """Create a Markdown stub for an entry.

    The stub consists solely of a YAML front matter block delimited by
    ``---`` lines.

    Parameters
    ----------
    index : int
        Number rendered in front of the title (``"<index>. <title>"``).
    title : str
        Title of the entry.
    summary : str
        Text stored under the ``excerpt`` key.
    link : str
        URL stored under the ``link`` key.

    Returns
    -------
    str
        The stub text. It starts with the opening ``---`` line and ends
        with a newline after the closing ``---`` line.
    """
    safe_title = _yaml_escape(title)
    safe_summary = _yaml_escape(summary)
    safe_link = _yaml_escape(link)
    return (
        # The opening fence has to be the very first line of the file for
        # the front matter to be recognised, so no leading blank line.
        "---\n"
        f'title: "{index}. {safe_title}"\n'
        # The summary deliberately starts on its own line inside the quotes.
        'excerpt: "\n'
        f'{safe_summary}"\n'
        # Quoted so that values such as "#" are not read as a YAML comment.
        f'link: "{safe_link}"\n'
        "btn_label: View Tutorial\n"
        "btn_class: btn--success btn--large\n"
        "---\n"
    )


def fetch_rss_feed_as_stubs(url: str) -> dict[str, str]:
    """Fetch an RSS feed and return a dictionary of Markdown stubs.

    The keys of the dictionary are filenames, and the values are the
    Markdown content.

    Parameters
    ----------
    url : str
        Location of the RSS feed to read.

    Returns
    -------
    dict[str, str]
        Mapping of ``"<NN>-<slug>.md"`` filenames to stub text, where
        ``NN`` is the zero-based, zero-padded position of the entry in the
        feed and ``slug`` is the slugified entry title.

    Warns
    -----
    UserWarning
        For every entry that has no title; such entries are skipped.
    """
    stubs: dict[str, str] = {}
    for i, item in enumerate(parse_rss_feed(url)):
        title = item.get("title")
        if not title:
            # A title is needed for both the filename and the stub heading,
            # so the entry cannot be rendered; report it instead of
            # dropping it silently.
            warnings.warn(
                f"Skipping RSS entry {i} from {url}: entry has no title",
                stacklevel=2,
            )
            continue
        filename = f"{i:02d}-{slugify(title)}.md"
        stubs[filename] = make_md_stub(
            index=i,
            title=title,
            # ``or`` also covers keys that are present but hold None/"".
            summary=item.get("summary") or "",
            link=item.get("link") or "#",
        )
    return stubs


def create_rss_feed_stubs(url: str, output_dir: str) -> None:
    """Create markdown stubs from an RSS feed URL into a directory.

    Parameters
    ----------
    url : str
        Location of the RSS feed to read.
    output_dir : str
        Directory that receives one ``.md`` file per stub. It is created,
        including missing parents, if it does not exist yet.
    """
    # Fetch first so that a failing feed does not leave a directory behind.
    stubs = fetch_rss_feed_as_stubs(url)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for filename, content in stubs.items():
        # TODO: should we wipe existing files?
        # Feed text may contain non-ASCII characters, so write UTF-8
        # explicitly rather than using the platform default encoding.
        (target_dir / filename).write_text(content, encoding="utf-8")
7 changes: 7 additions & 0 deletions src/pyosmeta/utils_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import doi
import requests
import unidecode

from .logging import logger

Expand Down Expand Up @@ -213,3 +214,9 @@ def clean_archive(archive):
return None
else:
raise ValueError(f"Invalid archive URL: {archive}")


def slugify(text: str) -> str:
    """Convert a long title/text into a slug suitable for filenames/URLs.

    The text is transliterated with ``unidecode``, lower-cased, and every
    run of non-word characters or underscores is collapsed into a single
    hyphen.

    Parameters
    ----------
    text : str
        Title or other free text to convert.

    Returns
    -------
    str
        The slug, without leading or trailing hyphens. It is empty when
        ``text`` contains no word characters.
    """
    ascii_text = unidecode.unidecode(text).lower()
    # Punctuation at either end of the text would otherwise leave a stray
    # hyphen behind (e.g. "Hello!" -> "hello-").
    return re.sub(r"[\W_]+", "-", ascii_text).strip("-")
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,11 @@ def _data_file(
return path

return _data_file


@pytest.fixture
def rss_feed_url(httpserver):
    """Serve a local RSS feed for testing.

    Hands the contents of ``tutorials.rss`` from the test data directory
    to the ``httpserver`` fixture and returns that server's URL.
    """
    path = DATA_DIR / "tutorials.rss"
    # The sample feed declares UTF-8 and contains non-ASCII punctuation, so
    # decode it explicitly instead of relying on the platform default.
    httpserver.serve_content(path.read_text(encoding="utf-8"))
    return httpserver.url
108 changes: 108 additions & 0 deletions tests/data/tutorials.rss
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- https://www.pyopensci.org/python-package-guide/tutorials.rss -->
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>pyOpenSci Tutorials</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/intro.html</link>
<atom:link href="https://www.pyopensci.org/python-package-guide/tutorials.rss" rel="self"/>
<description>A tutorial feed that lists metadata for the pyOpenSci Python packaging tutorials so we can automatically list them on our website.</description>
<language>en</language>
<lastBuildDate>Mon, 11 Aug 2025 21:09:23 GMT</lastBuildDate>
<item>
<title>Add a License and Code of Conduct to your python package</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/add-license-coc.html</link>
<description>Learn how to add a LICENSE and CODE_OF_CONDUCT file to your Python package. This lesson covers choosing a permissive license, placing key files for visibility on GitHub and PyPI, and adopting the Contributor Covenant to support an inclusive community.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/add-license-coc.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Add a README file to your Python package</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/add-readme.html</link>
<description>Learn how to create a clear, effective README file for your Python package. This lesson covers what to include, why each section matters, and how a well-structured README improves usability and discoverability on GitHub and PyPI.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/add-readme.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Command Line Reference Guide</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/command-line-reference.html</link>
<description>Learn how to add a command-line interface (CLI) to your Python package using the argparse library. This lesson walks you through creating a CLI entry point so users can run your package directly from the terminal.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/command-line-reference.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Create a Python package from scratch, a beginner-friendly tutorial</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/create-python-package.html</link>
<description>Learn how to create a Python package and make your code installable using Hatch. This tutorial walks you through structuring your code and configuring a pyproject.toml so others can easily install and use your package.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/create-python-package.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Use Hatch environments with your Python package: a beginner-friendly tutorial</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/develop-python-package-hatch.html</link>
<description>The pyOpenSci pure Python package template uses Hatch to manage environments and run tests, docs, and other maintenance steps. Learn how to use Hatch environments to manage your Python package.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/develop-python-package-hatch.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Get to Know Hatch</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/get-to-know-hatch.html</link>
<description>Get started with Hatch, a modern Python packaging tool. This lesson introduces Hatch’s features and shows how it simplifies environment management, project scaffolding, and building your package.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/get-to-know-hatch.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Python packaging 101</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/intro.html</link>
<description>This page outlines the key steps to create, document, and share a high-quality scientific Python package. Here you will also get an overview of the pyOpenSci packaging guide and what you’ll learn.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/intro.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Publish your Python package that is on PyPI to conda-forge</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/publish-conda-forge.html</link>
<description>Learn how to publish your Python package on conda-forge to make it easily installable with conda. This lesson covers the submission process, metadata requirements, and maintaining your feedstock.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/publish-conda-forge.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Publish your Python package to PyPI</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html</link>
<description>Learn how to publish your Python package on PyPI so others can install it using pip. This lesson covers building your package, creating a PyPI account, and uploading your distribution files.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Make your Python package PyPI ready - pyproject.toml</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/pyproject-toml.html</link>
<description>The pyproject.toml file is the central configuration file for building and packaging Python projects. This lesson explains key sections like name, version, dependencies, and how they support packaging and distribution. You’ll learn how to set up this file to ensure your package is ready for publishing.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/pyproject-toml.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Using Hatch to Migrate setup.py to a pyproject.toml</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/setup-py-to-pyproject-toml.html</link>
<description>If you’re creating a pure Python project, pyproject.toml is preferred over setup.py for packaging and configuration. Learn how to migrate from the older setup.py format to the modern pyproject.toml file. This lesson walks you through updating your package metadata and build settings to align with current Python packaging standards.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/setup-py-to-pyproject-toml.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
<item>
<title>Setup Trusted Publishing for secure and automated publishing via GitHub Actions</title>
<link>https://www.pyopensci.org/python-package-guide/tutorials/trusted-publishing.html</link>
<description>Learn how to publish your Python package automatically via GitHub Actions. This lesson also covers how to do publishing in a secure way by using Trusted Publishing.</description>
<author>pyOpenSci</author>
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/trusted-publishing.html</guid>
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
</item>
</channel>
</rss>
24 changes: 24 additions & 0 deletions tests/unit/test_parse_rss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from pyosmeta.parse_rss import fetch_rss_feed_as_stubs, parse_rss_feed


def test_rss_feed_parse(rss_feed_url):
    """Parsing the served feed yields dict entries with the core fields."""
    entries = parse_rss_feed(rss_feed_url)
    assert entries is not None
    assert len(entries) > 0
    for record in entries:
        assert isinstance(record, dict)
        # Every entry must expose the fields the stub generator reads.
        for field in ("title", "link", "summary"):
            assert field in record


def test_fetch_rss_feed_as_stubs(rss_feed_url):
    """Stub generation returns ``.md`` filenames mapped to front matter."""
    generated = fetch_rss_feed_as_stubs(rss_feed_url)
    assert isinstance(generated, dict)
    assert len(generated) > 0
    for name, body in generated.items():
        assert name.endswith(".md")
        assert isinstance(body, str)
        # Each stub must carry the three feed-derived front matter keys.
        for marker in ("title:", "excerpt:", "link:"):
            assert marker in body
Loading