Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# Binary
app/GermanTechPodcasts
app/GermanTechPodcasts

emails*.json
4 changes: 4 additions & 0 deletions app/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ run: build ## Compiles and starts the application
.PHONY: test
test: ## Runs all unit tests
go test -v -race ./...

.PHONY: extract-emails
extract-emails: build ## Extracts contact emails from RSS feeds and saves to JSON file
./GermanTechPodcasts extractEmails --json-directory ../generated --output ../emails_with_authors.json
273 changes: 273 additions & 0 deletions app/cmd/extractEmails.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
package cmd

import (
	"encoding/json"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"time"

	libIO "github.com/EngineeringKiosk/GermanTechPodcasts/io"
	"github.com/mmcdole/gofeed"
	"github.com/spf13/cobra"
)

// Result types serialized as JSON by the extractEmails command.
type (
	// AuthorInfo pairs a contact's display name with their email address.
	// Either field is dropped from the JSON output when empty.
	AuthorInfo struct {
		Name  string `json:"name,omitempty"`
		Email string `json:"email,omitempty"`
	}

	// EmailResult is the per-podcast outcome of scanning one RSS feed.
	EmailResult struct {
		// PodcastName and RSSFeed identify the podcast the result belongs to.
		PodcastName string `json:"podcastName"`
		RSSFeed     string `json:"rssFeed"`
		// Emails holds every distinct address found; always present in the JSON.
		Emails []string `json:"emails"`
		// Authors holds the addresses for which a name field was also available.
		Authors []AuthorInfo `json:"authors,omitempty"`
		// Error carries the feed parsing error message, if parsing failed.
		Error string `json:"error,omitempty"`
	}
)

// extractEmailsCmd defines the "extractEmails" subcommand. Its work is done
// by cmdExtractEmails (wired in through RunE, so a returned error is handed
// back to cobra); the command is attached to rootCmd and given its flags in
// this file's init function.
var extractEmailsCmd = &cobra.Command{
Use: "extractEmails",
Short: "Extract contact email addresses from podcast RSS feeds",
Long: `This command reads all podcast JSON files and extracts contact email addresses
from their RSS feeds. It looks for email addresses in various RSS fields like
managingEditor, webMaster, author, and description.`,
RunE: cmdExtractEmails,
}

// init attaches the extractEmails subcommand to rootCmd and declares its
// flags: the required --json-directory and the optional --output.
func init() {
	rootCmd.AddCommand(extractEmailsCmd)

	flags := extractEmailsCmd.Flags()
	flags.String("json-directory", "", "Directory containing the podcast JSON files")
	flags.String("output", "", "Optional: Output file to write results (if not provided, prints to stdout)")

	// The flag is declared just above, so an error here can only mean the
	// name was mistyped. Fail loudly at startup instead of silently shipping
	// a command whose "required" flag is not actually enforced.
	if err := extractEmailsCmd.MarkFlagRequired("json-directory"); err != nil {
		panic(err)
	}
}

// cmdExtractEmails is the RunE handler of the extractEmails command.
//
// It reads the --json-directory and --output flags, lists the JSON files in
// that directory, decodes each one into a PodcastInformation and runs
// processRSSFeed for every podcast that has an RSS feed URL. Files that
// cannot be read or decoded, and podcasts without a feed URL, are logged and
// skipped rather than aborting the run.
//
// The collected results are written to the --output file when one is given,
// otherwise they are printed to stdout. An error is returned when a flag
// cannot be read, the directory cannot be listed, or the output step fails.
func cmdExtractEmails(cmd *cobra.Command, args []string) error {
	jsonDir, err := cmd.Flags().GetString("json-directory")
	if err != nil {
		return err
	}

	outputFile, err := cmd.Flags().GetString("output")
	if err != nil {
		return err
	}

	log.Printf("Reading files with extension %s from directory %s", libIO.JSONExtension, jsonDir)
	jsonFiles, err := libIO.GetAllFilesFromDirectory(jsonDir, libIO.JSONExtension)
	if err != nil {
		return err
	}
	log.Printf("%d files found with extension %s in directory %s", len(jsonFiles), libIO.JSONExtension, jsonDir)

	// One parser (and one HTTP client) is shared by all feeds; the client
	// timeout keeps a single unresponsive host from stalling the whole run.
	client := &http.Client{Timeout: 30 * time.Second}
	fp := gofeed.NewParser()
	fp.Client = client

	// Allocate a non-nil slice so that a run without any usable podcast is
	// encoded as the JSON array [] rather than null.
	results := make([]EmailResult, 0, len(jsonFiles))

	// Process each podcast file
	for _, f := range jsonFiles {
		absJsonFilePath := filepath.Join(jsonDir, f.Name())
		log.Printf("Processing file %s", absJsonFilePath)

		jsonFileContent, err := os.ReadFile(absJsonFilePath)
		if err != nil {
			log.Printf("Error reading file %s: %v", absJsonFilePath, err)
			continue
		}

		podcastInfo := &PodcastInformation{}
		if err := json.Unmarshal(jsonFileContent, podcastInfo); err != nil {
			log.Printf("Error unmarshaling JSON from %s: %v", absJsonFilePath, err)
			continue
		}

		if podcastInfo.RSSFeed == "" {
			log.Printf("Skipping %s: no RSS feed URL", podcastInfo.Name)
			continue
		}

		results = append(results, processRSSFeed(*podcastInfo, fp))
	}

	// Output results
	if outputFile != "" {
		return writeResultsToFile(results, outputFile)
	}
	return printResults(results)
}

// processRSSFeed downloads and parses the RSS feed of podcast with fp and
// collects the contact email addresses it exposes.
//
// Addresses are gathered from the feed-level author fields, the iTunes owner
// and author, the feed description, the "managingEditor"/"webMaster" custom
// fields, and the authors and descriptions of the first three items.
// Structured sources (a name next to an email field) also yield an
// AuthorInfo; free-text sources only contribute addresses.
//
// The returned EmailResult always carries the podcast name and feed URL.
// When the feed cannot be fetched or parsed, Error holds the message and
// Emails stays empty. Emails and Authors are sorted by address so that
// repeated runs over unchanged feeds produce identical output.
func processRSSFeed(podcast PodcastInformation, fp *gofeed.Parser) EmailResult {
	// Number of leading feed items inspected for contact information.
	const maxEpisodesToCheck = 3

	result := EmailResult{
		PodcastName: podcast.Name,
		RSSFeed:     podcast.RSSFeed,
		Emails:      []string{},
	}

	log.Printf("Parsing RSS feed for %s: %s", podcast.Name, podcast.RSSFeed)

	feed, err := fp.ParseURL(podcast.RSSFeed)
	if err != nil {
		result.Error = err.Error()
		log.Printf("Error parsing RSS feed for %s: %v", podcast.Name, err)
		return result
	}

	// Maps de-duplicate; authors is keyed by email, so a later source with
	// the same address replaces the name recorded by an earlier one.
	emails := make(map[string]bool)
	authors := make(map[string]AuthorInfo)

	// addAuthor records a structured name/email pair; empty emails are ignored.
	addAuthor := func(name, email string) {
		if email == "" {
			return
		}
		emails[email] = true
		authors[email] = AuthorInfo{Name: name, Email: email}
	}
	// addPeople records every non-nil person of a feed or item author list.
	addPeople := func(people ...*gofeed.Person) {
		for _, p := range people {
			if p != nil {
				addAuthor(p.Name, p.Email)
			}
		}
	}

	// Check managingEditor and webMaster fields.
	// NOTE(review): whether the parser actually fills feed.Custom with these
	// keys is not visible here; confirm against gofeed.
	if managingEditor, ok := feed.Custom["managingEditor"]; ok {
		addEmailsFromText(managingEditor, emails)
	}
	if webMaster, ok := feed.Custom["webMaster"]; ok {
		addEmailsFromText(webMaster, emails)
	}

	// Check author fields
	addPeople(feed.Author)
	addPeople(feed.Authors...)

	// Check description for emails
	if feed.Description != "" {
		addEmailsFromText(feed.Description, emails)
	}

	// Check iTunes fields
	if itunes := feed.ITunesExt; itunes != nil {
		if itunes.Owner != nil {
			addAuthor(itunes.Owner.Name, itunes.Owner.Email)
		}
		if itunes.Author != "" {
			addEmailsFromText(itunes.Author, emails)
		}
	}

	// Check the first few episodes for contact info
	episodesToCheck := len(feed.Items)
	if episodesToCheck > maxEpisodesToCheck {
		episodesToCheck = maxEpisodesToCheck
	}
	for _, item := range feed.Items[:episodesToCheck] {
		addPeople(item.Author)
		addPeople(item.Authors...)
		if item.Description != "" {
			addEmailsFromText(item.Description, emails)
		}
	}

	// Convert maps to slices. Map iteration order is random, so sort
	// afterwards to keep the output stable between runs.
	for email := range emails {
		result.Emails = append(result.Emails, email)
	}
	sort.Strings(result.Emails)

	for _, author := range authors {
		result.Authors = append(result.Authors, author)
	}
	sort.Slice(result.Authors, func(i, j int) bool {
		return result.Authors[i].Email < result.Authors[j].Email
	})

	if len(result.Emails) > 0 {
		log.Printf("Found %d email(s) and %d author(s) for %s: %v", len(result.Emails), len(result.Authors), podcast.Name, result.Emails)
	} else {
		log.Printf("No emails found for %s", podcast.Name)
	}

	return result
}

// contactEmailPattern matches email-like tokens in free text. It is compiled
// once at package initialisation instead of on every addEmailsFromText call.
var contactEmailPattern = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)

// addEmailsFromText finds every email-like token in text and records it as a
// key in emails (value true). The map is modified in place and must be
// non-nil; addresses already present are left as they are.
func addEmailsFromText(text string, emails map[string]bool) {
	for _, match := range contactEmailPattern.FindAllString(text, -1) {
		// The local part of the pattern admits '.', so a match can begin with
		// punctuation that belongs to the surrounding sentence; strip it.
		email := strings.Trim(match, ".,;!?")
		emails[email] = true
	}
}

// writeResultsToFile encodes results as indented JSON into the file named
// filename, creating it or truncating an existing one.
//
// It returns the first error hit while creating, encoding to, or closing the
// file. The success message is logged only after the file was closed cleanly.
func writeResultsToFile(results []EmailResult, filename string) error {
	file, err := os.Create(filename)
	if err != nil {
		return err
	}

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")

	if err := encoder.Encode(results); err != nil {
		// The encode error is the one worth reporting; closing is only
		// cleanup here, so its error is deliberately dropped.
		_ = file.Close()
		return err
	}

	// Close explicitly instead of deferring: a failed close on a written
	// file can mean the data never reached disk, which must not be reported
	// as success.
	if err := file.Close(); err != nil {
		return err
	}

	log.Printf("Results written to %s", filename)
	return nil
}

// printResults logs a short summary of results and then writes them as
// indented JSON to stdout. It returns the encoder's error, if any.
func printResults(results []EmailResult) error {
	// The summary goes through the standard logger (stderr unless it was
	// redirected elsewhere), which keeps stdout free for the JSON document.
	log.Printf("Found emails from %d podcasts", len(results))

	var podcastsWithEmails, emailCount int
	for i := range results {
		n := len(results[i].Emails)
		if n == 0 {
			continue
		}
		podcastsWithEmails++
		emailCount += n
	}
	log.Printf("Emails found in %d/%d podcasts (%d total emails)", podcastsWithEmails, len(results), emailCount)

	// Emit the JSON document on stdout.
	out := json.NewEncoder(os.Stdout)
	out.SetIndent("", " ")
	return out.Encode(results)
}
15 changes: 12 additions & 3 deletions app/go.mod
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
module github.com/EngineeringKiosk/GermanTechPodcasts

go 1.25
go 1.19

require (
github.com/gosimple/slug v1.15.0
github.com/spf13/cobra v1.10.1
github.com/mmcdole/gofeed v1.3.0
github.com/spf13/cobra v1.8.1
gopkg.in/yaml.v3 v3.0.1
)

require (
github.com/PuerkitoBio/goquery v1.8.0 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/gosimple/unidecode v1.0.1 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kr/pretty v0.1.0 // indirect
github.com/spf13/pflag v1.0.10 // indirect
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/net v0.4.0 // indirect
golang.org/x/text v0.5.0 // indirect
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
)
45 changes: 39 additions & 6 deletions app/go.sum
Original file line number Diff line number Diff line change
@@ -1,21 +1,54 @@
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/gosimple/slug v1.15.0 h1:wRZHsRrRcs6b0XnxMUBM6WK1U1Vg5B0R7VkIf1Xzobo=
github.com/gosimple/slug v1.15.0/go.mod h1:UiRaFH+GEilHstLUmcBgWcI42viBN7mAb818JrYOeFQ=
github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6T/o=
github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s=
github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0=
github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU=
golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand Down
Loading