Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,15 @@ webencodings==0.5.1 ; python_version >= "3.11" and python_version < "4.0"
websockets==13.1 ; python_version >= "3.11" and python_version < "4.0"
wheel==0.44.0 ; python_version >= "3.11" and python_version < "4.0"
youtube-transcript-api==0.6.2 ; python_version >= "3.11" and python_version < "4.0"
python-docx>=0.8.11 ; python_version >= "3.11" and python_version < "4.0"
python-pptx>=0.6.21 ; python_version >= "3.11" and python_version < "4.0"
openpyxl>=3.1.2 ; python_version >= "3.11" and python_version < "4.0"
xlrd>=2.0.1 ; python_version >= "3.11" and python_version < "4.0"
pandas>=2.0.0 ; python_version >= "3.11" and python_version < "4.0"
requests>=2.31.0 ; python_version >= "3.11" and python_version < "4.0"
pymupdf>=1.23.8 ; python_version >= "3.11" and python_version < "4.0"
beautifulsoup4>=4.12.0 ; python_version >= "3.11" and python_version < "4.0"
# NOTE: youtube-transcript-api is already pinned above (==0.6.2, which satisfies >=0.6.1);
# the duplicate ">=0.6.1" entry was removed to keep a single specifier per package.
google-generativeai>=0.3.0 ; python_version >= "3.11" and python_version < "4.0"
pytest>=7.4.0 ; python_version >= "3.11" and python_version < "4.0"
pytest-mock>=3.11.1 ; python_version >= "3.11" and python_version < "4.0"
200 changes: 84 additions & 116 deletions podcastfy/content_parser/content_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
Content Extractor Module

This module provides functionality to extract content from various sources including
websites, YouTube videos, and PDF files. It serves as a central hub for content
extraction, delegating to specialized extractors based on the source type.
websites, YouTube videos, PDF files, and Microsoft Office documents. It serves as a
central hub for content extraction, delegating to specialized extractors based on
the source type.
"""

import logging
Expand All @@ -12,122 +13,89 @@
from urllib.parse import urlparse
from .youtube_transcriber import YouTubeTranscriber
from .website_extractor import WebsiteExtractor
from .pdf_extractor import PDFExtractor
from .unified_extractor import UnifiedExtractor
from podcastfy.utils.config import load_config

logger = logging.getLogger(__name__)

class ContentExtractor:
    """Central hub for content extraction.

    Delegates to specialized extractors (YouTube transcripts, websites,
    PDF files) based on the source type.
    """

    def __init__(self):
        """
        Initialize the ContentExtractor with one extractor per source type.
        """
        self.youtube_transcriber = YouTubeTranscriber()
        self.website_extractor = WebsiteExtractor()
        self.pdf_extractor = PDFExtractor()
        self.config = load_config()
        # A missing section degrades to an empty dict rather than failing.
        self.content_extractor_config = self.config.get('content_extractor', {})

    def is_url(self, source: str) -> bool:
        """
        Check if the given source is a valid URL.

        A scheme-less input such as "example.com" is treated as a URL by
        prepending "https://" before validation.

        Args:
            source (str): The source to check.

        Returns:
            bool: True if the source is a valid URL, False otherwise.
        """
        try:
            # If the source doesn't start with a scheme, add 'https://'
            if not source.startswith(('http://', 'https://')):
                source = 'https://' + source

            result = urlparse(source)
            # Valid only when both a scheme and a network location parse out.
            return all([result.scheme, result.netloc])
        except ValueError:
            return False

    def extract_content(self, source: str) -> str:
        """
        Extract content from various sources.

        Args:
            source (str): URL or file path of the content source.

        Returns:
            str: Extracted text content.

        Raises:
            ValueError: If the source type is unsupported.
        """
        try:
            # The .pdf check runs first so PDF sources always use the PDF
            # extractor, even when they would also pass the URL check.
            if source.lower().endswith('.pdf'):
                return self.pdf_extractor.extract_content(source)
            elif self.is_url(source):
                # .get() avoids a KeyError when the config section lacks
                # 'youtube_url_patterns'; such URLs then fall through to
                # the website extractor.
                youtube_patterns = self.content_extractor_config.get('youtube_url_patterns', [])
                if any(pattern in source for pattern in youtube_patterns):
                    return self.youtube_transcriber.extract_transcript(source)
                else:
                    return self.website_extractor.extract_content(source)
            else:
                raise ValueError("Unsupported source type")
        except Exception as e:
            logger.error(f"Error extracting content from {source}: {str(e)}")
            raise

    def generate_topic_content(self, topic: str) -> str:
        """
        Generate content based on a given topic using a generative model.

        Args:
            topic (str): The topic to generate content for.

        Returns:
            str: Generated content based on the topic.
        """
        try:
            # Imported lazily so the dependency is only needed when topic
            # generation is actually used.
            import google.generativeai as genai

            model = genai.GenerativeModel('models/gemini-1.5-flash-002')
            topic_prompt = f'Be detailed. Search for {topic}'
            response = model.generate_content(contents=topic_prompt, tools='google_search_retrieval')

            return response.candidates[0].content.parts[0].text
        except Exception as e:
            logger.error(f"Error generating content for topic '{topic}': {str(e)}")
            raise


def main(seed: int = 42) -> None:
    """Exercise the ContentExtractor against a few representative sources.

    Args:
        seed (int): Unused; kept for interface compatibility.
    """
    logging.basicConfig(level=logging.INFO)

    extractor = ContentExtractor()

    # One sample per source family: bare domain, YouTube URL, local PDF.
    test_sources: List[str] = [
        "www.souzatharsis.com",
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "path/to/sample.pdf",
    ]

    for source in test_sources:
        try:
            logger.info(f"Extracting content from: {source}")
            content = extractor.extract_content(source)
            # Preview plus total size, then a visual separator.
            logger.info(f"Extracted content (first 500 characters):\n{content[:500]}...")
            logger.info(f"Total length of extracted content: {len(content)} characters")
            logger.info("-" * 50)
        except Exception as e:
            logger.error(f"An error occurred while processing {source}: {str(e)}")


if __name__ == "__main__":
    main()
def __init__(self):
    """
    Initialize the ContentExtractor with specialized extractors for different content types.

    Instantiates the YouTube transcriber, the website extractor, and the
    unified (document/file) extractor, then loads the application config
    and caches the 'content_extractor' section (empty dict when absent).
    """
    self.youtube_transcriber = YouTubeTranscriber()
    self.website_extractor = WebsiteExtractor()
    self.unified_extractor = UnifiedExtractor()
    self.config = load_config()
    # .get() degrades to an empty section rather than raising on a
    # config file that lacks 'content_extractor'.
    self.content_extractor_config = self.config.get('content_extractor', {})

def is_url(self, source: str) -> bool:
    """
    Check if the given source is a valid URL using strict validation.

    Accepts http(s)/ftp(s) URLs, localhost, IPv4 hosts, and scheme-less
    domains such as "example.com" (optionally with a path).

    Args:
        source (str): The source to check.

    Returns:
        bool: True if the source is a valid URL, False otherwise.
    """
    if not source or not isinstance(source, str):
        return False

    # Full URL (with protocol) validation.
    # TLD length is 2-63 characters (the DNS label limit), so long modern
    # TLDs such as ".technology" are accepted; the previous upper bound
    # of 6 characters rejected them.
    url_pattern = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https:// or ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}|'  # domain
        r'localhost|'  # localhost
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or IPv4 (NOTE(review): octets not range-checked)
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    if url_pattern.match(source):
        return True

    # Domain-only URLs (e.g. "example.com"): must contain a dot, no
    # whitespace, and must not be protocol-relative ("//host/...").
    if '.' in source and ' ' not in source and not source.startswith('//'):
        domain_pattern = re.compile(
            r'^(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}(?:/?|[/?]\S+)?$',
            re.IGNORECASE
        )
        return bool(domain_pattern.match(source))

    return False

def extract_content(self, source: Union[str, bytes]) -> str:
    """
    Extract content from various sources.

    Args:
        source (Union[str, bytes]): URL, file path, or raw bytes of the source.

    Returns:
        str: Extracted text content.

    Raises:
        ValueError: If the source type is unsupported or invalid.
        Exception: For other extraction errors.
    """
    try:
        if isinstance(source, str) and self.is_url(source):
            source_lower = source.lower()

            # YouTube URLs go to the transcript extractor. .get() avoids
            # a KeyError when the config key is absent.
            youtube_patterns = self.content_extractor_config.get('youtube_url_patterns', [])
            if any(pattern in source_lower for pattern in youtube_patterns):
                return self.youtube_transcriber.extract_transcript(source)

            # Decide website vs. document by the URL *path* extension.
            # A substring-anywhere test would misroute hosts whose names
            # merely contain an extension, e.g. "www.doc-site.com".
            if source_lower.startswith(('http://', 'https://', 'ftp://', 'ftps://')):
                normalized = source_lower
            else:
                # is_url() also accepts scheme-less domains; add a scheme
                # so urlparse() yields a meaningful path component.
                normalized = 'https://' + source_lower
            path = urlparse(normalized).path
            doc_extensions = ('.pdf', '.doc', '.docx', '.xls', '.xlsx',
                              '.ppt', '.pptx', '.txt', '.json', '.xml', '.csv')
            if not path.endswith(doc_extensions):
                return self.website_extractor.extract_content(source)

        # Everything else (local files, bytes content, document URLs) is
        # handled by the unified extractor.
        return self.unified_extractor.extract_content(source)

    except Exception as e:
        logger.error(f"Error extracting content from {source}: {str(e)}")
        raise
68 changes: 0 additions & 68 deletions podcastfy/content_parser/pdf_extractor.py

This file was deleted.

Loading
Loading