Skip to content

Commit 74c8ca9

Browse files
Refactor main.py: Enhance file processing with concurrent execution and add text MIME type checking; update LineCounter to utilize new scanning method
1 parent 7a907dd commit 74c8ca9

File tree

3 files changed

+84
-30
lines changed

3 files changed

+84
-30
lines changed

docs/changelog.rst

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
Changelog
22
=========
33

4+
0.0.3 - Draft (2025-05-27)
5+
--------------------------
6+
7+
- Enhance file processing with concurrent execution
8+
- Add text MIME type checking
9+
- Update LineCounter to utilize os.scandir instead of os.walk for efficiency
10+
- Reduced processing time on tested directories by about 17 seconds, down to 0.56s
11+
412
0.0.2 (2025-05-26)
513
------------------
614

@@ -9,10 +17,6 @@ Changelog
917
- Modular structure introduced to support upcoming plugins and advanced analysis modes.
1018
- Publishing pipeline verified for seamless CI/CD deployment to PyPI.
1119

12-
---
13-
14-
Let me know if you'd like a one-liner summary too!
15-
1620

1721
0.0.1 (2025-05-24)
1822
------------------

extliner/main.py

Lines changed: 74 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,41 +2,91 @@
22
import json
33
from pathlib import Path
44
from collections import defaultdict
5-
from typing import Dict, List, Optional
5+
from typing import Dict, List, Optional, Tuple, Union
6+
from concurrent.futures import ProcessPoolExecutor, as_completed
7+
8+
from tqdm import tqdm
9+
10+
11+
import mimetypes
12+
13+
def is_text_mimetype(path: Union[str, Path]) -> bool:
    """Return True when the name of *path* maps to a ``text/*`` MIME type.

    The decision comes from ``mimetypes.guess_type`` and is therefore based
    on the file name only; the file itself is never opened.

    Args:
        path: File name or path, given as a string or a ``Path``.

    Returns:
        ``True`` if a MIME type could be guessed and it starts with
        ``"text/"``; ``False`` otherwise, including when no type can be
        guessed at all.
    """
    mime, _ = mimetypes.guess_type(path)
    return mime is not None and mime.startswith("text/")
16+
17+
def process_file(filepath: str, encoding: str) -> Optional[Tuple[str, int, int]]:
    """Count the total and the non-blank lines of a single file.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to open the file. Bytes that cannot be
            decoded are dropped (``errors="ignore"``).

    Returns:
        A tuple ``(ext, with_spaces, without_spaces)``: ``ext`` is the
        lower-cased file suffix (``"no_ext"`` when the name has no suffix),
        ``with_spaces`` is the total number of lines, and
        ``without_spaces`` is the number of lines that contain something
        other than whitespace. ``None`` is returned when the file cannot
        be opened or read.
    """
    ext = (Path(filepath).suffix or "NO_EXT").lower()

    total = 0
    non_blank = 0
    try:
        with open(filepath, "r", encoding=encoding, errors="ignore") as f:
            # One pass produces both counters, so the file is not rewound
            # and read a second time.
            for line in f:
                total += 1
                if line.strip():
                    non_blank += 1
    except (OSError, LookupError, ValueError):
        # OSError: missing or unreadable file; LookupError: unknown
        # encoding name; ValueError: malformed path (e.g. embedded NUL).
        return None
    return ext, total, non_blank
28+
29+
def scan_files(directory: Path, ignore_folders: set, ignore_exts: set) -> List[str]:
    """Recursively collect the text files found under *directory*.

    Args:
        directory: Root directory of the scan.
        ignore_folders: Directory names (not full paths) that are not
            descended into.
        ignore_exts: Lower-cased suffixes to skip. Files without a suffix
            are matched against the label ``"NO_EXT"``.

    Returns:
        String paths of the regular, non-symlink files whose suffix is not
        ignored and for which ``is_text_mimetype`` returns true.

    Raises:
        OSError: If *directory* itself cannot be listed. Subdirectories
            that cannot be listed are skipped instead.
    """
    file_list: List[str] = []

    def _recursive_scan(path: Path) -> None:
        # The context manager closes the scandir iterator as soon as the
        # directory has been processed instead of leaving the handle to
        # the garbage collector.
        with os.scandir(path) as entries:
            for entry in entries:
                entry_path = Path(entry.path)
                if entry.is_dir(follow_symlinks=False):
                    if entry_path.name in ignore_folders:
                        continue
                    try:
                        _recursive_scan(entry_path)
                    except OSError:
                        # Unreadable or vanished subdirectory: skip it so
                        # one bad folder does not abort the whole scan.
                        continue
                elif entry.is_file(follow_symlinks=False):
                    ext = entry_path.suffix.lower() or "NO_EXT"
                    if ext in ignore_exts:
                        continue
                    if is_text_mimetype(entry_path):
                        file_list.append(str(entry_path))

    _recursive_scan(directory)
    return file_list
646

747

848
class LineCounter:
9-
def __init__(self, ignore_extensions: Optional[List[str]] = None, ignore_folder: Optional[List[str]] = None, encoding: str = "utf-8"):
49+
def __init__(
50+
self,
51+
ignore_extensions: Optional[List[str]] = None,
52+
ignore_folder: Optional[List[str]] = None,
53+
encoding: str = "utf-8",
54+
use_progress: bool = True,
55+
max_workers: Optional[int] = None,
56+
):
1057
self.encoding = encoding
1158
self.ignore_folder = set(ignore_folder or [])
1259
self.ignore_extensions = set(ignore_extensions or [])
1360
self.with_spaces: Dict[str, int] = defaultdict(int)
1461
self.without_spaces: Dict[str, int] = defaultdict(int)
1562
self.file_count: Dict[str, int] = defaultdict(int)
63+
self.use_progress = use_progress and tqdm is not None
64+
self.max_workers = max_workers
1665

17-
def count_lines(self, directory: Path) -> Dict[str, Dict[str, int]]:
66+
def count_lines(self, directory: Union[str, Path]) -> Dict[str, Dict[str, int]]:
1867
directory = Path(directory)
1968
if not directory.is_dir():
2069
raise ValueError(f"{directory} is not a valid directory")
21-
22-
for root, dirs, files in os.walk(directory):
23-
# Remove ignored folders from traversal
24-
dirs[:] = [d for d in dirs if d not in self.ignore_folder]
25-
for file in files:
26-
filepath = Path(root) / file
27-
ext = (filepath.suffix or "NO_EXT").lower()
28-
29-
if ext in self.ignore_extensions:
30-
continue
31-
32-
try:
33-
with open(filepath, "r", encoding=self.encoding, errors="ignore") as f:
34-
lines = f.readlines()
35-
self.file_count[ext] += 1
36-
self.with_spaces[ext] += len(lines)
37-
self.without_spaces[ext] += sum(1 for line in lines if line.strip())
38-
except Exception as e:
39-
print(f"Error reading {filepath}: {e}")
70+
71+
filepaths = scan_files(directory, self.ignore_folder, self.ignore_extensions)
72+
73+
with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
74+
futures = [
75+
executor.submit(process_file, filepath, self.encoding)
76+
for filepath in filepaths
77+
]
78+
iterator = as_completed(futures)
79+
80+
if self.use_progress:
81+
iterator = tqdm(iterator, total=len(futures), desc="Counting lines")
82+
83+
for future in iterator:
84+
result = future.result()
85+
if result:
86+
ext, with_spaces, without_spaces = result
87+
self.file_count[ext] += 1
88+
self.with_spaces[ext] += with_spaces
89+
self.without_spaces[ext] += without_spaces
4090

4191
return self._build_result()
4292

@@ -53,7 +103,7 @@ def _build_result(self) -> Dict[str, Dict[str, int]]:
53103
@staticmethod
54104
def to_json(data: Dict) -> str:
55105
return json.dumps(data, indent=2)
56-
106+
57107
@staticmethod
58108
def to_csv(data: Dict) -> str:
59109
import csv
@@ -67,12 +117,11 @@ def to_csv(data: Dict) -> str:
67117
writer.writerow([ext, counts["with_spaces"], counts["without_spaces"], counts["file_count"]])
68118

69119
return output.getvalue()
70-
120+
71121
@staticmethod
72122
def to_markdown(data: Dict) -> str:
73123
output = "| Extension | With Spaces | Without Spaces | File Count |\n"
74124
output += "|-----------|-------------|----------------|------------|\n"
75125
for ext, counts in data.items():
76126
output += f"| {ext} | {counts['with_spaces']} | {counts['without_spaces']} | {counts['file_count']} |\n"
77-
78127
return output

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
tabulate==0.9.0
1+
tabulate==0.9.0
2+
tqdm==4.67.1

0 commit comments

Comments
 (0)