# podcasts/scrape_episodes.py at main · hasadna/podcasts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import json
import subprocess
import xml.etree.ElementTree as ET

from data import rssxml


# Parse the XML text `rssxml` (imported from the project's `data` module) once,
# at import time; `root` is the top-level element that the loop at the bottom
# of this file searches for `channel/item` entries.
root = ET.fromstring(rssxml)


def download_item(item):
    """Save one feed item's metadata as JSON and fetch its cover image.

    Reads ``guid``, ``title``, ``description``, ``link``, ``pubDate``, the
    ``<enclosure>`` URL and the iTunes ``<image>`` href from ``item``, then:

    * downloads the image to ``site/images/<guid>.jpg`` by invoking ``wget``
      (skipped when the item has no image URL);
    * writes the collected fields to ``metadata/<guid>.json`` as UTF-8.

    Both target directories must already exist; this function does not
    create them.

    Args:
        item: An ``xml.etree.ElementTree.Element`` for a single ``<item>``.

    Returns:
        None. The results are the files written to disk.
    """
    guid = item.findtext('guid')

    # ``find`` returns None for an absent element, so look the elements up
    # first instead of chaining ``.get`` directly onto the result.
    enclosure = item.find('enclosure')
    image = item.find('{http://www.itunes.com/dtds/podcast-1.0.dtd}image')

    data = {
        "guid": guid,
        "title": item.findtext('title'),
        "description": item.findtext('description'),
        "link": item.findtext('link'),
        "enclosure_url": enclosure.get('url') if enclosure is not None else None,
        "pubDate": item.findtext('pubDate'),
        "image": image.get('href') if image is not None else None,
    }

    # Only the cover image is fetched here; the audio enclosure is not
    # downloaded, just recorded by URL in the metadata.
    if data['image']:
        subprocess.run(["wget", "-O", f"site/images/{guid}.jpg", data['image']])

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly rather than with the platform default.
    with open(f'metadata/{guid}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


# Script driver: process every <item> under <channel>, in document order.
for item in root.iterfind('channel/item'):
    download_item(item)