diff --git a/.gitignore b/.gitignore
index a8b026f4..dbe3b763 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 # Binary
-app/GermanTechPodcasts
\ No newline at end of file
+app/GermanTechPodcasts
+
+emails*.json
\ No newline at end of file
diff --git a/app/Makefile b/app/Makefile
index 00fa8447..9df863d0 100644
--- a/app/Makefile
+++ b/app/Makefile
@@ -28,3 +28,7 @@ run: build ## Compiles and starts the application
 .PHONY: test
 test: ## Runs all unit tests
 	go test -v -race ./...
+
+.PHONY: extract-emails
+extract-emails: build ## Extracts contact emails from RSS feeds and saves to JSON file
+	./GermanTechPodcasts extractEmails --json-directory ../generated --output ../emails_with_authors.json
diff --git a/app/cmd/extractEmails.go b/app/cmd/extractEmails.go
new file mode 100644
index 00000000..bb785277
--- /dev/null
+++ b/app/cmd/extractEmails.go
@@ -0,0 +1,320 @@
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+	"time"
+
+	libIO "github.com/EngineeringKiosk/GermanTechPodcasts/io"
+	"github.com/mmcdole/gofeed"
+	"github.com/spf13/cobra"
+)
+
+// maxEpisodesToScan is how many of the first items of a feed are searched for
+// contact details.
+const maxEpisodesToScan = 3
+
+// emailRegex matches e-mail-like tokens in free text. It is compiled once at
+// package scope rather than on every addEmailsFromText call.
+var emailRegex = regexp.MustCompile(`[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`)
+
+// AuthorInfo is a named contact taken from a structured author/owner field.
+type AuthorInfo struct {
+	Name  string `json:"name,omitempty"`
+	Email string `json:"email,omitempty"`
+}
+
+// EmailResult holds everything extracted from a single podcast feed. Error is
+// set, and Emails left empty, when the feed could not be fetched or parsed.
+type EmailResult struct {
+	PodcastName string       `json:"podcastName"`
+	RSSFeed     string       `json:"rssFeed"`
+	Emails      []string     `json:"emails"`
+	Authors     []AuthorInfo `json:"authors,omitempty"`
+	Error       string       `json:"error,omitempty"`
+}
+
+// extractEmailsCmd represents the extractEmails command
+var extractEmailsCmd = &cobra.Command{
+	Use:   "extractEmails",
+	Short: "Extract contact email addresses from podcast RSS feeds",
+	Long: `This command reads all podcast JSON files and extracts contact email addresses
+from their RSS feeds. It looks for email addresses in various RSS fields like
+managingEditor, webMaster, author, and description.`,
+	RunE: cmdExtractEmails,
+}
+
+func init() {
+	rootCmd.AddCommand(extractEmailsCmd)
+
+	extractEmailsCmd.Flags().String("json-directory", "", "Directory containing the podcast JSON files")
+	extractEmailsCmd.Flags().String("output", "", "Optional: Output file to write results (if not provided, prints to stdout)")
+
+	// MarkFlagRequired reports an error instead of panicking, so a typo in the
+	// flag name would otherwise go unnoticed; fail loudly at start-up.
+	if err := extractEmailsCmd.MarkFlagRequired("json-directory"); err != nil {
+		log.Fatalf("extractEmails: %v", err)
+	}
+}
+
+// cmdExtractEmails reads every podcast JSON file in --json-directory, scans the
+// RSS feed of each podcast for contact e-mail addresses, and writes the results
+// as JSON to --output, or to stdout when --output is not given.
+//
+// Files that cannot be read or decoded, and podcasts without a feed URL, are
+// logged and skipped; they do not fail the command.
+func cmdExtractEmails(cmd *cobra.Command, args []string) error {
+	jsonDir, err := cmd.Flags().GetString("json-directory")
+	if err != nil {
+		return err
+	}
+
+	outputFile, err := cmd.Flags().GetString("output")
+	if err != nil {
+		return err
+	}
+
+	log.Printf("Reading files with extension %s from directory %s", libIO.JSONExtension, jsonDir)
+	jsonFiles, err := libIO.GetAllFilesFromDirectory(jsonDir, libIO.JSONExtension)
+	if err != nil {
+		return fmt.Errorf("listing JSON files in %q: %w", jsonDir, err)
+	}
+	log.Printf("%d files found with extension %s in directory %s", len(jsonFiles), libIO.JSONExtension, jsonDir)
+
+	// One parser (and HTTP client) is shared by all feeds; the timeout keeps a
+	// single unresponsive host from stalling the whole run.
+	fp := gofeed.NewParser()
+	fp.Client = &http.Client{Timeout: 30 * time.Second}
+
+	// Start from an empty, non-nil slice so that "no podcasts" is encoded as
+	// [] rather than null.
+	results := make([]EmailResult, 0, len(jsonFiles))
+
+	for _, f := range jsonFiles {
+		jsonFilePath := filepath.Join(jsonDir, f.Name())
+		log.Printf("Processing file %s", jsonFilePath)
+
+		jsonFileContent, err := os.ReadFile(jsonFilePath)
+		if err != nil {
+			log.Printf("Error reading file %s: %v", jsonFilePath, err)
+			continue
+		}
+
+		podcastInfo := &PodcastInformation{}
+		if err := json.Unmarshal(jsonFileContent, podcastInfo); err != nil {
+			log.Printf("Error unmarshaling JSON from %s: %v", jsonFilePath, err)
+			continue
+		}
+
+		if podcastInfo.RSSFeed == "" {
+			log.Printf("Skipping %s: no RSS feed URL", podcastInfo.Name)
+			continue
+		}
+
+		results = append(results, processRSSFeed(*podcastInfo, fp))
+	}
+
+	if outputFile != "" {
+		return writeResultsToFile(results, outputFile)
+	}
+	return printResults(results)
+}
+
+// contactSet accumulates the de-duplicated contacts found while scanning a feed.
+type contactSet struct {
+	emails  map[string]bool       // set of every address seen
+	authors map[string]AuthorInfo // keyed by e-mail; a later entry replaces an earlier one
+}
+
+// newContactSet returns an empty, ready-to-use contactSet.
+func newContactSet() *contactSet {
+	return &contactSet{
+		emails:  make(map[string]bool),
+		authors: make(map[string]AuthorInfo),
+	}
+}
+
+// addAuthor records a structured name/e-mail pair. Entries without an e-mail
+// address are ignored.
+func (c *contactSet) addAuthor(name, email string) {
+	email = strings.TrimSpace(email)
+	if email == "" {
+		return
+	}
+	c.emails[email] = true
+	c.authors[email] = AuthorInfo{Name: name, Email: email}
+}
+
+// addPerson records a gofeed person; nil entries are ignored.
+func (c *contactSet) addPerson(p *gofeed.Person) {
+	if p != nil {
+		c.addAuthor(p.Name, p.Email)
+	}
+}
+
+// addText records every e-mail address that appears in free text.
+func (c *contactSet) addText(text string) {
+	addEmailsFromText(text, c.emails)
+}
+
+// sortedEmails returns the collected addresses in ascending order. The result
+// is never nil, so it is encoded as [] rather than null when empty.
+func (c *contactSet) sortedEmails() []string {
+	out := make([]string, 0, len(c.emails))
+	for email := range c.emails {
+		out = append(out, email)
+	}
+	sort.Strings(out)
+	return out
+}
+
+// sortedAuthors returns the collected authors ordered by e-mail address, or nil
+// when there are none.
+func (c *contactSet) sortedAuthors() []AuthorInfo {
+	if len(c.authors) == 0 {
+		return nil
+	}
+	out := make([]AuthorInfo, 0, len(c.authors))
+	for _, author := range c.authors {
+		out = append(out, author)
+	}
+	sort.Slice(out, func(i, j int) bool { return out[i].Email < out[j].Email })
+	return out
+}
+
+// processRSSFeed fetches and parses the RSS feed of podcast and collects the
+// contact e-mail addresses it exposes.
+//
+// Addresses are taken from the feed-level managingEditor/webMaster custom
+// fields, author entries, description and iTunes owner/author, plus the author
+// entries and description of the first maxEpisodesToScan items. A fetch or
+// parse failure is logged and reported through the Error field of the result.
+// Emails and Authors are sorted so repeated runs produce identical output.
+func processRSSFeed(podcast PodcastInformation, fp *gofeed.Parser) EmailResult {
+	result := EmailResult{
+		PodcastName: podcast.Name,
+		RSSFeed:     podcast.RSSFeed,
+		Emails:      []string{},
+	}
+
+	log.Printf("Parsing RSS feed for %s: %s", podcast.Name, podcast.RSSFeed)
+
+	feed, err := fp.ParseURL(podcast.RSSFeed)
+	if err != nil {
+		result.Error = err.Error()
+		log.Printf("Error parsing RSS feed for %s: %v", podcast.Name, err)
+		return result
+	}
+
+	contacts := newContactSet()
+
+	// Feed-level fields.
+	for _, key := range []string{"managingEditor", "webMaster"} {
+		if value, ok := feed.Custom[key]; ok {
+			contacts.addText(value)
+		}
+	}
+	contacts.addPerson(feed.Author)
+	for _, author := range feed.Authors {
+		contacts.addPerson(author)
+	}
+	contacts.addText(feed.Description)
+
+	// iTunes extension fields.
+	if itunes := feed.ITunesExt; itunes != nil {
+		if itunes.Owner != nil {
+			contacts.addAuthor(itunes.Owner.Name, itunes.Owner.Email)
+		}
+		contacts.addText(itunes.Author)
+	}
+
+	// Only the first few episodes are scanned for contact details.
+	episodes := feed.Items
+	if len(episodes) > maxEpisodesToScan {
+		episodes = episodes[:maxEpisodesToScan]
+	}
+	for _, item := range episodes {
+		if item == nil {
+			continue
+		}
+		contacts.addPerson(item.Author)
+		for _, author := range item.Authors {
+			contacts.addPerson(author)
+		}
+		contacts.addText(item.Description)
+	}
+
+	result.Emails = contacts.sortedEmails()
+	result.Authors = contacts.sortedAuthors()
+
+	if len(result.Emails) > 0 {
+		log.Printf("Found %d email(s) and %d author(s) for %s: %v", len(result.Emails), len(result.Authors), podcast.Name, result.Emails)
+	} else {
+		log.Printf("No emails found for %s", podcast.Name)
+	}
+
+	return result
+}
+
+// addEmailsFromText adds every e-mail address found in text to the emails set.
+func addEmailsFromText(text string, emails map[string]bool) {
+	for _, match := range emailRegex.FindAllString(text, -1) {
+		// The local-part pattern also accepts dots, so a match can start inside
+		// preceding punctuation such as "...name@host.tld"; strip that off.
+		emails[strings.Trim(match, ".,;!?")] = true
+	}
+}
+
+// writeResultsToFile writes results as indented JSON to filename, creating the
+// file or truncating an existing one.
+func writeResultsToFile(results []EmailResult, filename string) error {
+	file, err := os.Create(filename)
+	if err != nil {
+		return err
+	}
+
+	encoder := json.NewEncoder(file)
+	encoder.SetIndent("", " ")
+	if err := encoder.Encode(results); err != nil {
+		_ = file.Close() // the encode error is the one worth reporting
+		return fmt.Errorf("writing results to %q: %w", filename, err)
+	}
+
+	// Close explicitly: for a file that was just written, a failing Close can
+	// mean the data never reached the disk.
+	if err := file.Close(); err != nil {
+		return fmt.Errorf("closing %q: %w", filename, err)
+	}
+
+	log.Printf("Results written to %s", filename)
+	return nil
+}
+
+// printResults logs a short summary and writes results as indented JSON to
+// stdout.
+func printResults(results []EmailResult) error {
+	// The log package writes to stderr by default, so the summary does not
+	// pollute the JSON on stdout.
+	log.Printf("Processed %d podcasts", len(results))
+
+	foundEmails := 0
+	totalEmails := 0
+	for _, result := range results {
+		if n := len(result.Emails); n > 0 {
+			foundEmails++
+			totalEmails += n
+		}
+	}
+	log.Printf("Emails found in %d/%d podcasts (%d total emails)", foundEmails, len(results), totalEmails)
+
+	encoder := json.NewEncoder(os.Stdout)
+	encoder.SetIndent("", " ")
+	return encoder.Encode(results)
+}
diff --git a/app/go.mod b/app/go.mod
index 9bf3bd60..ae210cc1 100644
--- a/app/go.mod
+++ b/app/go.mod
@@ -1,17 +1,26 @@
 module github.com/EngineeringKiosk/GermanTechPodcasts
 
-go 1.25
+go 1.19
 
 require (
 	github.com/gosimple/slug v1.15.0
-	github.com/spf13/cobra v1.10.1
+	github.com/mmcdole/gofeed v1.3.0
+	github.com/spf13/cobra v1.8.1
 	gopkg.in/yaml.v3 v3.0.1
 )
 
 require (
+	github.com/PuerkitoBio/goquery v1.8.0 // indirect
+	github.com/andybalholm/cascadia v1.3.1 // indirect
 	github.com/gosimple/unidecode v1.0.1 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/kr/pretty v0.1.0 // indirect
-	github.com/spf13/pflag v1.0.10 // indirect
+	github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/spf13/pflag v1.0.5 // indirect
+	golang.org/x/net v0.4.0 // indirect
+	golang.org/x/text v0.5.0 // indirect
 	gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
 )
diff --git a/app/go.sum b/app/go.sum
index bf9a4a66..14c88fc0 100644
--- a/app/go.sum
+++ b/app/go.sum
@@ -1,21 +1,54 @@
-github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
+github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
+github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/gosimple/slug v1.15.0 h1:wRZHsRrRcs6b0XnxMUBM6WK1U1Vg5B0R7VkIf1Xzobo= github.com/gosimple/slug v1.15.0/go.mod h1:UiRaFH+GEilHstLUmcBgWcI42viBN7mAb818JrYOeFQ= github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6T/o= github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4= +github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE= +github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk= +github.com/mmcdole/goxpp 
v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= -github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= -github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= -github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod 
h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.4.0 h1:Q5QPcMlvfxFTAPV0+07Xz/MpK9NTXu2VDUuy0FeMfaU= +golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM= +golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=