
Commit 8dc95a6

fix(workflows): use temp file for RSS parsing (#11)
- Use temp file instead of piping to avoid broken pipe errors
- Pass target_date as command line argument to Python (see the sketch below)
- Remove conflicting env block that could override shell variable
1 parent 5765956 commit 8dc95a6
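Why the argv change works: with `python3 -`, the interpreter reads the program itself from stdin (the heredoc), so stdin is no longer free to carry the RSS payload, and any arguments placed after `-` show up in `sys.argv`. The following is a minimal sketch of that calling convention only, using placeholder names and data rather than anything from this workflow:

# Hedged sketch of the `python3 - <args> << heredoc` pattern (placeholder values)
DATA_FILE=$(mktemp)                       # stand-in for RSS_FILE
printf '<rss><channel></channel></rss>' > "$DATA_FILE"

python3 - "2024-01-01" "$DATA_FILE" << 'PY'
import sys
# sys.argv[0] is "-"; the positional arguments follow it
target_date, data_file = sys.argv[1], sys.argv[2]
print(f"date={target_date} bytes={len(open(data_file, 'rb').read())}")
PY

rm -f "$DATA_FILE"                        # clean up, as the workflow now does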

File tree

1 file changed: +68 -72 lines changed


.github/workflows/detect-blog-post-from-rss.yml

Lines changed: 68 additions & 72 deletions
@@ -65,86 +65,84 @@ jobs:
           fi

           echo "Fetching RSS from: $RSS_URL"
-          RSS_CONTENT=$(curl -s "$RSS_URL")
+          RSS_FILE=$(mktemp)
+          curl -s "$RSS_URL" > "$RSS_FILE"

-          if [ -z "$RSS_CONTENT" ]; then
+          if [ ! -s "$RSS_FILE" ]; then
             echo "Failed to fetch RSS feed"
             echo "has_posts=false" >> $GITHUB_OUTPUT
             echo "post_count=0" >> $GITHUB_OUTPUT
+            rm -f "$RSS_FILE"
             exit 0
           fi

           # Parse RSS and find posts matching target date
-          # Extract items and filter by pubDate
-          POSTS_JSON=$(echo "$RSS_CONTENT" | python3 << 'PYTHON_SCRIPT'
-          import sys
-          import xml.etree.ElementTree as ET
-          import json
-          from email.utils import parsedate_to_datetime
-          import os
-
-          target_date = os.environ.get('TARGET_DATE', '')
-          rss_content = sys.stdin.read()
-
-          try:
-              root = ET.fromstring(rss_content)
-          except ET.ParseError as e:
-              print(json.dumps([]))
-              sys.exit(0)
-
-          posts = []
-          channel = root.find('channel')
-          if channel is None:
-              print(json.dumps([]))
-              sys.exit(0)
-
-          for item in channel.findall('item'):
-              pub_date_elem = item.find('pubDate')
-              if pub_date_elem is None:
-                  continue
-
-              pub_date_str = pub_date_elem.text
-              # Parse RFC 822 date format using email.utils (handles GMT correctly)
-              try:
-                  pub_date = parsedate_to_datetime(pub_date_str)
-              except (ValueError, TypeError):
-                  continue
-
-              post_date = pub_date.strftime('%Y-%m-%d')
-
-              if post_date != target_date:
-                  continue
-
-              title = item.find('title')
-              link = item.find('link')
-              description = item.find('description')
-              enclosure = item.find('enclosure')
-
-              # Get categories
-              categories = []
-              for cat in item.findall('category'):
-                  if cat.text:
-                      categories.append(cat.text)
-
-              # Convert categories to hashtags
-              hashtags = ' '.join(['#' + cat.replace(' ', '') for cat in categories])
-
-              post = {
-                  'title': title.text if title is not None else '',
-                  'url': link.text if link is not None else '',
-                  'description': description.text if description is not None else '',
-                  'categories': categories,
-                  'hashtags': hashtags,
-                  'image_url': enclosure.get('url') if enclosure is not None else '',
-                  'pub_date': post_date
-              }
-              posts.append(post)
-
-          print(json.dumps(posts))
-          PYTHON_SCRIPT
+          POSTS_JSON=$(python3 - "$TARGET_DATE" "$RSS_FILE" << 'PYTHON_SCRIPT'
+          import sys
+          import xml.etree.ElementTree as ET
+          import json
+          from email.utils import parsedate_to_datetime
+
+          target_date = sys.argv[1]
+          rss_file = sys.argv[2]
+
+          try:
+              tree = ET.parse(rss_file)
+              root = tree.getroot()
+          except ET.ParseError as e:
+              print(json.dumps([]))
+              sys.exit(0)
+
+          posts = []
+          channel = root.find('channel')
+          if channel is None:
+              print(json.dumps([]))
+              sys.exit(0)
+
+          for item in channel.findall('item'):
+              pub_date_elem = item.find('pubDate')
+              if pub_date_elem is None:
+                  continue
+
+              pub_date_str = pub_date_elem.text
+              try:
+                  pub_date = parsedate_to_datetime(pub_date_str)
+              except (ValueError, TypeError):
+                  continue
+
+              post_date = pub_date.strftime('%Y-%m-%d')
+
+              if post_date != target_date:
+                  continue
+
+              title = item.find('title')
+              link = item.find('link')
+              description = item.find('description')
+              enclosure = item.find('enclosure')
+
+              categories = []
+              for cat in item.findall('category'):
+                  if cat.text:
+                      categories.append(cat.text)
+
+              hashtags = ' '.join(['#' + cat.replace(' ', '') for cat in categories])
+
+              post = {
+                  'title': title.text if title is not None else '',
+                  'url': link.text if link is not None else '',
+                  'description': description.text if description is not None else '',
+                  'categories': categories,
+                  'hashtags': hashtags,
+                  'image_url': enclosure.get('url') if enclosure is not None else '',
+                  'pub_date': post_date
+              }
+              posts.append(post)
+
+          print(json.dumps(posts))
+          PYTHON_SCRIPT
           )

-          export TARGET_DATE="$TARGET_DATE"
+          rm -f "$RSS_FILE"

           POST_COUNT=$(echo "$POSTS_JSON" | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")

@@ -186,5 +184,3 @@ jobs:
           echo "post_description=$POST_DESCRIPTION" >> $GITHUB_OUTPUT
           echo "post_hashtags=$POST_HASHTAGS" >> $GITHUB_OUTPUT
           echo "post_image_url=$POST_IMAGE_URL" >> $GITHUB_OUTPUT
-        env:
-          TARGET_DATE: ${{ inputs.target_date }}
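For a quick local check of the new flow, the changed step can be approximated outside GitHub Actions. Everything below is a hedged sketch: the feed URL and target date are placeholders, and the Python body mirrors only the temp-file plus date-filtering portion of the script added in this commit, not the full workflow output handling.

#!/usr/bin/env bash
# Hedged local smoke test; RSS_URL and TARGET_DATE are placeholders.
set -euo pipefail

RSS_URL="https://example.com/feed.xml"   # placeholder feed
TARGET_DATE="2024-01-01"                 # placeholder date (YYYY-MM-DD)

RSS_FILE=$(mktemp)
curl -s "$RSS_URL" > "$RSS_FILE"

# Same calling convention as the workflow: target date and file path via argv.
python3 - "$TARGET_DATE" "$RSS_FILE" << 'PYTHON_SCRIPT'
import json
import sys
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime

target_date, rss_file = sys.argv[1], sys.argv[2]
titles = []
try:
    channel = ET.parse(rss_file).getroot().find('channel')
except ET.ParseError:
    channel = None
if channel is not None:
    for item in channel.findall('item'):
        pub = item.find('pubDate')
        if pub is None or pub.text is None:
            continue
        try:
            post_date = parsedate_to_datetime(pub.text).strftime('%Y-%m-%d')
        except (ValueError, TypeError):
            continue
        if post_date != target_date:
            continue
        title = item.find('title')
        titles.append(title.text if title is not None else '')
print(json.dumps(titles))
PYTHON_SCRIPT

rm -f "$RSS_FILE"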
