Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,15 @@ webencodings==0.5.1 ; python_version >= "3.11" and python_version < "4.0"
websockets==13.1 ; python_version >= "3.11" and python_version < "4.0"
wheel==0.44.0 ; python_version >= "3.11" and python_version < "4.0"
youtube-transcript-api==0.6.2 ; python_version >= "3.11" and python_version < "4.0"
python-docx>=0.8.11 ; python_version >= "3.11" and python_version < "4.0"
python-pptx>=0.6.21 ; python_version >= "3.11" and python_version < "4.0"
openpyxl>=3.1.2 ; python_version >= "3.11" and python_version < "4.0"
xlrd>=2.0.1 ; python_version >= "3.11" and python_version < "4.0"
pandas>=2.0.0 ; python_version >= "3.11" and python_version < "4.0"
requests>=2.31.0 ; python_version >= "3.11" and python_version < "4.0"
pymupdf>=1.23.8 ; python_version >= "3.11" and python_version < "4.0"
beautifulsoup4>=4.12.0 ; python_version >= "3.11" and python_version < "4.0"
# NOTE: youtube-transcript-api is already pinned above (==0.6.2, which satisfies >=0.6.1);
# the duplicate ">=0.6.1" entry was removed to keep a single specifier per package.
google-generativeai>=0.3.0 ; python_version >= "3.11" and python_version < "4.0"
pytest>=7.4.0 ; python_version >= "3.11" and python_version < "4.0"
pytest-mock>=3.11.1 ; python_version >= "3.11" and python_version < "4.0"
200 changes: 84 additions & 116 deletions podcastfy/content_parser/content_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
Content Extractor Module

This module provides functionality to extract content from various sources including
websites, YouTube videos, and PDF files. It serves as a central hub for content
extraction, delegating to specialized extractors based on the source type.
websites, YouTube videos, PDF files, and Microsoft Office documents. It serves as a
central hub for content extraction, delegating to specialized extractors based on
the source type.
"""

import logging
Expand All @@ -12,122 +13,89 @@
from urllib.parse import urlparse
from .youtube_transcriber import YouTubeTranscriber
from .website_extractor import WebsiteExtractor
from .pdf_extractor import PDFExtractor
from .unified_extractor import UnifiedExtractor
from podcastfy.utils.config import load_config

logger = logging.getLogger(__name__)

class ContentExtractor:
    """Central hub for content extraction.

    Delegates to specialized extractors (YouTube transcripts, websites,
    PDF files) based on the source type.
    """

    def __init__(self):
        """
        Initialize the ContentExtractor with one extractor per source type.
        """
        self.youtube_transcriber = YouTubeTranscriber()
        self.website_extractor = WebsiteExtractor()
        self.pdf_extractor = PDFExtractor()
        self.config = load_config()
        # A missing section degrades to an empty dict rather than failing.
        self.content_extractor_config = self.config.get('content_extractor', {})

    def is_url(self, source: str) -> bool:
        """
        Check if the given source is a valid URL.

        A scheme-less input such as "example.com" is treated as a URL by
        prepending "https://" before validation.

        Args:
            source (str): The source to check.

        Returns:
            bool: True if the source is a valid URL, False otherwise.
        """
        try:
            # If the source doesn't start with a scheme, add 'https://'
            if not source.startswith(('http://', 'https://')):
                source = 'https://' + source

            result = urlparse(source)
            # Valid only when both a scheme and a network location parse out.
            return all([result.scheme, result.netloc])
        except ValueError:
            return False

    def extract_content(self, source: str) -> str:
        """
        Extract content from various sources.

        Args:
            source (str): URL or file path of the content source.

        Returns:
            str: Extracted text content.

        Raises:
            ValueError: If the source type is unsupported.
        """
        try:
            # The .pdf check runs first so PDF sources always use the PDF
            # extractor, even when they would also pass the URL check.
            if source.lower().endswith('.pdf'):
                return self.pdf_extractor.extract_content(source)
            elif self.is_url(source):
                # .get() avoids a KeyError when the config section lacks
                # 'youtube_url_patterns'; such URLs then fall through to
                # the website extractor.
                youtube_patterns = self.content_extractor_config.get('youtube_url_patterns', [])
                if any(pattern in source for pattern in youtube_patterns):
                    return self.youtube_transcriber.extract_transcript(source)
                else:
                    return self.website_extractor.extract_content(source)
            else:
                raise ValueError("Unsupported source type")
        except Exception as e:
            logger.error(f"Error extracting content from {source}: {str(e)}")
            raise

    def generate_topic_content(self, topic: str) -> str:
        """
        Generate content based on a given topic using a generative model.

        Args:
            topic (str): The topic to generate content for.

        Returns:
            str: Generated content based on the topic.
        """
        try:
            # Imported lazily so the dependency is only needed when topic
            # generation is actually used.
            import google.generativeai as genai

            model = genai.GenerativeModel('models/gemini-1.5-flash-002')
            topic_prompt = f'Be detailed. Search for {topic}'
            response = model.generate_content(contents=topic_prompt, tools='google_search_retrieval')

            return response.candidates[0].content.parts[0].text
        except Exception as e:
            logger.error(f"Error generating content for topic '{topic}': {str(e)}")
            raise


def main(seed: int = 42) -> None:
    """Exercise the ContentExtractor against a few representative sources.

    Args:
        seed (int): Unused; kept for interface compatibility.
    """
    logging.basicConfig(level=logging.INFO)

    extractor = ContentExtractor()

    # One sample per source family: bare domain, YouTube URL, local PDF.
    test_sources: List[str] = [
        "www.souzatharsis.com",
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "path/to/sample.pdf",
    ]

    for source in test_sources:
        try:
            logger.info(f"Extracting content from: {source}")
            content = extractor.extract_content(source)
            # Preview plus total size, then a visual separator.
            logger.info(f"Extracted content (first 500 characters):\n{content[:500]}...")
            logger.info(f"Total length of extracted content: {len(content)} characters")
            logger.info("-" * 50)
        except Exception as e:
            logger.error(f"An error occurred while processing {source}: {str(e)}")


if __name__ == "__main__":
    main()
def __init__(self):
    """
    Initialize the ContentExtractor with specialized extractors for different content types.

    Instantiates the YouTube transcriber, the website extractor, and the
    unified (document/file) extractor, then loads the application config
    and caches the 'content_extractor' section (empty dict when absent).
    """
    self.youtube_transcriber = YouTubeTranscriber()
    self.website_extractor = WebsiteExtractor()
    self.unified_extractor = UnifiedExtractor()
    self.config = load_config()
    # .get() degrades to an empty section rather than raising on a
    # config file that lacks 'content_extractor'.
    self.content_extractor_config = self.config.get('content_extractor', {})

def is_url(self, source: str) -> bool:
    """
    Check if the given source is a valid URL using strict validation.

    Accepts http(s)/ftp(s) URLs, localhost, IPv4 hosts, and scheme-less
    domains such as "example.com" (optionally with a path).

    Args:
        source (str): The source to check.

    Returns:
        bool: True if the source is a valid URL, False otherwise.
    """
    if not source or not isinstance(source, str):
        return False

    # Full URL (with protocol) validation.
    # TLD length is 2-63 characters (the DNS label limit), so long modern
    # TLDs such as ".technology" are accepted; the previous upper bound
    # of 6 characters rejected them.
    url_pattern = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https:// or ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}|'  # domain
        r'localhost|'  # localhost
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or IPv4 (NOTE(review): octets not range-checked)
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    if url_pattern.match(source):
        return True

    # Domain-only URLs (e.g. "example.com"): must contain a dot, no
    # whitespace, and must not be protocol-relative ("//host/...").
    if '.' in source and ' ' not in source and not source.startswith('//'):
        domain_pattern = re.compile(
            r'^(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}(?:/?|[/?]\S+)?$',
            re.IGNORECASE
        )
        return bool(domain_pattern.match(source))

    return False

def extract_content(self, source: Union[str, bytes]) -> str:
    """
    Extract content from various sources.

    Args:
        source (Union[str, bytes]): URL, file path, or raw bytes of the source.

    Returns:
        str: Extracted text content.

    Raises:
        ValueError: If the source type is unsupported or invalid.
        Exception: For other extraction errors.
    """
    try:
        if isinstance(source, str) and self.is_url(source):
            source_lower = source.lower()

            # YouTube URLs go to the transcript extractor. .get() avoids
            # a KeyError when the config key is absent.
            youtube_patterns = self.content_extractor_config.get('youtube_url_patterns', [])
            if any(pattern in source_lower for pattern in youtube_patterns):
                return self.youtube_transcriber.extract_transcript(source)

            # Decide website vs. document by the URL *path* extension.
            # A substring-anywhere test would misroute hosts whose names
            # merely contain an extension, e.g. "www.doc-site.com".
            if source_lower.startswith(('http://', 'https://', 'ftp://', 'ftps://')):
                normalized = source_lower
            else:
                # is_url() also accepts scheme-less domains; add a scheme
                # so urlparse() yields a meaningful path component.
                normalized = 'https://' + source_lower
            path = urlparse(normalized).path
            doc_extensions = ('.pdf', '.doc', '.docx', '.xls', '.xlsx',
                              '.ppt', '.pptx', '.txt', '.json', '.xml', '.csv')
            if not path.endswith(doc_extensions):
                return self.website_extractor.extract_content(source)

        # Everything else (local files, bytes content, document URLs) is
        # handled by the unified extractor.
        return self.unified_extractor.extract_content(source)

    except Exception as e:
        logger.error(f"Error extracting content from {source}: {str(e)}")
        raise
68 changes: 0 additions & 68 deletions podcastfy/content_parser/pdf_extractor.py

This file was deleted.

Loading
Loading