Refactor main.py: Enhance file processing with concurrent execution and add text MIME type checking; update LineCounter to utilize new scanning method

codeperfectplus · codeperfectplus · commit 74c8ca905842 · 2025-05-26T17:20:20.000+05:30
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,6 +1,14 @@
 Changelog
 =========
 
+0.0.3 - Draft (2025-05-27)
+--------------------------
+
+- Enhance file processing with concurrent execution 
+- Add text MIME type checking
+- Update LineCounter to utilize os.scandir instead of os.walk for efficiency
+- Reduced the processing time by 17 seconds, down to 0.56s, on tested directories
+
 0.0.2 (2025-05-26)
 ------------------
 
@@ -9,10 +17,6 @@ Changelog
 - Modular structure introduced to support upcoming plugins and advanced analysis modes.
 - Publishing pipeline verified for seamless CI/CD deployment to PyPI.
 
----
-
-Let me know if you'd like a one-liner summary too!
-
 
 0.0.1 (2025-05-24)
 ------------------
diff --git a/extliner/main.py b/extliner/main.py
@@ -2,41 +2,91 @@
 import json
 from pathlib import Path
 from collections import defaultdict
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple, Union
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+from tqdm import tqdm
+
+
+import mimetypes
+
+def is_text_mimetype(path: str) -> bool:
+    mime, _ = mimetypes.guess_type(path)
+    return mime is not None and mime.startswith("text/")
+
+def process_file(filepath: str, encoding: str) -> Optional[Tuple[str, int, int]]:
+    ext = (Path(filepath).suffix or "NO_EXT").lower()
+
+    try:
+        with open(filepath, "r", encoding=encoding, errors="ignore") as f:
+            with_spaces = sum(1 for _ in f)
+            f.seek(0)
+            without_spaces = sum(1 for line in f if line.strip())
+        return ext, with_spaces, without_spaces
+    except Exception:
+        return None
+
+def scan_files(directory: Path, ignore_folders: set, ignore_exts: set) -> List[str]:
+    file_list = []
+
+    def _recursive_scan(path: Path):
+        for entry in os.scandir(path):
+            entry_path = Path(entry.path)
+            if entry.is_dir(follow_symlinks=False):
+                if entry_path.name not in ignore_folders:
+                    _recursive_scan(entry_path)
+            elif entry.is_file(follow_symlinks=False):
+                ext = entry_path.suffix.lower() or "NO_EXT"
+                if ext not in ignore_exts:
+                    if is_text_mimetype(entry_path):
+                        file_list.append(str(entry_path))
+
+    _recursive_scan(directory)
+    return file_list
 
 
 class LineCounter:
-    def __init__(self, ignore_extensions: Optional[List[str]] = None, ignore_folder: Optional[List[str]] = None, encoding: str = "utf-8"):
+    def __init__(
+        self,
+        ignore_extensions: Optional[List[str]] = None,
+        ignore_folder: Optional[List[str]] = None,
+        encoding: str = "utf-8",
+        use_progress: bool = True,
+        max_workers: Optional[int] = None,
+    ):
         self.encoding = encoding
         self.ignore_folder = set(ignore_folder or [])
         self.ignore_extensions = set(ignore_extensions or [])
         self.with_spaces: Dict[str, int] = defaultdict(int)
         self.without_spaces: Dict[str, int] = defaultdict(int)
         self.file_count: Dict[str, int] = defaultdict(int)
+        self.use_progress = use_progress and tqdm is not None
+        self.max_workers = max_workers
 
-    def count_lines(self, directory: Path) -> Dict[str, Dict[str, int]]:
+    def count_lines(self, directory: Union[str, Path]) -> Dict[str, Dict[str, int]]:
         directory = Path(directory)
         if not directory.is_dir():
             raise ValueError(f"{directory} is not a valid directory")
-        
-        for root, dirs, files in os.walk(directory):
-            # Remove ignored folders from traversal
-            dirs[:] = [d for d in dirs if d not in self.ignore_folder]
-            for file in files:
-                filepath = Path(root) / file
-                ext = (filepath.suffix or "NO_EXT").lower()
-
-                if ext in self.ignore_extensions:
-                    continue
-
-                try:
-                    with open(filepath, "r", encoding=self.encoding, errors="ignore") as f:
-                        lines = f.readlines()
-                        self.file_count[ext] += 1
-                        self.with_spaces[ext] += len(lines)
-                        self.without_spaces[ext] += sum(1 for line in lines if line.strip())
-                except Exception as e:
-                    print(f"Error reading {filepath}: {e}")
+
+        filepaths = scan_files(directory, self.ignore_folder, self.ignore_extensions)
+
+        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = [
+                executor.submit(process_file, filepath, self.encoding)
+                for filepath in filepaths
+            ]
+            iterator = as_completed(futures)
+
+            if self.use_progress:
+                iterator = tqdm(iterator, total=len(futures), desc="Counting lines")
+
+            for future in iterator:
+                result = future.result()
+                if result:
+                    ext, with_spaces, without_spaces = result
+                    self.file_count[ext] += 1
+                    self.with_spaces[ext] += with_spaces
+                    self.without_spaces[ext] += without_spaces
 
         return self._build_result()
 
@@ -53,7 +103,7 @@ def _build_result(self) -> Dict[str, Dict[str, int]]:
     @staticmethod
     def to_json(data: Dict) -> str:
         return json.dumps(data, indent=2)
-    
+
     @staticmethod
     def to_csv(data: Dict) -> str:
         import csv
@@ -67,12 +117,11 @@ def to_csv(data: Dict) -> str:
             writer.writerow([ext, counts["with_spaces"], counts["without_spaces"], counts["file_count"]])
 
         return output.getvalue()
-        
+
     @staticmethod
     def to_markdown(data: Dict) -> str:
         output = "| Extension | With Spaces | Without Spaces | File Count |\n"
         output += "|-----------|-------------|----------------|------------|\n"
         for ext, counts in data.items():
             output += f"| {ext} | {counts['with_spaces']} | {counts['without_spaces']} | {counts['file_count']} |\n"
-
         return output
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-tabulate==0.9.0
+tabulate==0.9.0
+tqdm==4.67.1

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-tabulate==0.9.0`
	`1`	`+tabulate==0.9.0`
	`2`	`+tqdm==4.67.1`