Skip to content

Commit ee37d64

Browse files
committed
refactor: consolidate pattern parsing and ignore file handling in CLI and query parser
1 parent b8be811 commit ee37d64

File tree

5 files changed

+75
-64
lines changed

5 files changed

+75
-64
lines changed

src/gitingest/cli.py

Lines changed: 5 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -6,57 +6,11 @@
66

77
import click
88

9-
from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_PATH
9+
from gitingest.config import MAX_FILE_SIZE
10+
from gitingest.query_parser import _parse_patterns, parse_ignore_file
1011
from gitingest.repository_ingest import ingest
1112

1213

13-
def parse_ignore_file(ignore_file_path: Path) -> set[str]:
    """
    Parse the .gitingestignore file and return a set of patterns to ignore.

    Each line is stripped of surrounding whitespace. Blank lines and comment
    lines (lines whose first non-whitespace character is ``#``) are skipped.

    Parameters
    ----------
    ignore_file_path : Path
        Path to the .gitingestignore file

    Returns
    -------
    set[str]
        Set of patterns to ignore. Empty when the path does not exist or is
        not a regular file.
    """
    # is_file() also rejects directories, which exists() would let through to open().
    if not ignore_file_path.is_file():
        return set()

    with open(ignore_file_path, encoding="utf-8") as f:
        # Strip first so that indented comments are recognised as comments.
        stripped_lines = (line.strip() for line in f)
        return {line for line in stripped_lines if line and not line.startswith("#")}
35-
36-
37-
def parse_patterns(patterns: tuple[str, ...]) -> set[str]:
    """
    Collect the individual patterns given on the command line.

    Every entry may itself hold several whitespace-separated patterns, and
    the option may be repeated (multiple -e/-i arguments); all resulting
    tokens are merged into one set.

    Parameters
    ----------
    patterns : tuple[str, ...]
        Tuple of patterns from command line

    Returns
    -------
    set[str]
        Set of parsed patterns
    """
    # str.split() with no separator already discards empty and
    # whitespace-only tokens, so no extra stripping is required.
    return {token for entry in patterns for token in entry.split()}
58-
59-
6014
@click.command()
6115
@click.argument("source", type=str, default=".")
6216
@click.option("--output", "-o", default=None, help="Output file path (default: <repo_name>.txt in current directory)")
@@ -139,8 +93,8 @@ async def async_main(
13993
output = f"{repo_name}.txt"
14094

14195
# Parse command line patterns
142-
exclude_patterns = parse_patterns(exclude_pattern)
143-
include_patterns = parse_patterns(include_pattern)
96+
exclude_patterns = _parse_patterns(exclude_pattern)
97+
include_patterns = _parse_patterns(include_pattern)
14498

14599
# Read and add patterns from ignore file
146100
ignore_file_path = Path(source) / ignore_file
@@ -170,4 +124,4 @@ async def async_main(
170124

171125

172126
if __name__ == "__main__":
173-
main()
127+
main()

src/gitingest/ignore_patterns.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -153,4 +153,6 @@
153153
"*.tfstate*",
154154
## Dependencies in various languages
155155
"vendor/",
156+
## gitingestignore file
157+
".gitingestignore",
156158
}

src/gitingest/query_parser.py

Lines changed: 54 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH
1313
from gitingest.exceptions import InvalidPatternError
1414
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
15-
from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list
15+
from gitingest.repository_clone import CloneConfig, _check_repo_exists, clone_repo, fetch_remote_branch_list
1616

1717
HEX_DIGITS: set[str] = set(string.hexdigits)
1818

@@ -48,6 +48,30 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes
4848
pattern_type: str | None = None
4949

5050

51+
def parse_ignore_file(ignore_file_path: Path) -> set[str]:
    """
    Parse the .gitingestignore file and return a set of patterns to ignore.

    Each line is stripped of surrounding whitespace. Blank lines and comment
    lines (lines whose first non-whitespace character is ``#``) are skipped.

    Parameters
    ----------
    ignore_file_path : Path
        Path to the .gitingestignore file

    Returns
    -------
    set[str]
        Set of patterns to ignore. Empty when the path does not exist or is
        not a regular file.
    """
    # is_file() also rejects directories, which exists() would let through to open().
    if not ignore_file_path.is_file():
        return set()

    with open(ignore_file_path, encoding="utf-8") as f:
        # Strip first so that indented comments are recognised as comments.
        stripped_lines = (line.strip() for line in f)
        return {line for line in stripped_lines if line and not line.startswith("#")}
73+
74+
5175
async def parse_query(
5276
source: str,
5377
max_file_size: int,
@@ -89,6 +113,24 @@ async def parse_query(
89113
# Local path scenario
90114
parsed_query = _parse_path(source)
91115

116+
# Clone the repository if it's a URL
117+
if parsed_query.url:
118+
clone_config = CloneConfig(
119+
url=parsed_query.url,
120+
local_path=str(parsed_query.local_path),
121+
commit=parsed_query.commit,
122+
branch=parsed_query.branch,
123+
)
124+
await clone_repo(clone_config)
125+
126+
# Look for .gitingestignore file in the cloned repository
127+
ignore_file_path = Path(parsed_query.local_path) / ".gitingestignore"
128+
additional_ignore_patterns = parse_ignore_file(ignore_file_path)
129+
if ignore_patterns:
130+
ignore_patterns.update(additional_ignore_patterns)
131+
else:
132+
ignore_patterns = additional_ignore_patterns
133+
92134
# Combine default ignore patterns + custom patterns
93135
ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy()
94136
if ignore_patterns:
@@ -283,17 +325,18 @@ def _normalize_pattern(pattern: str) -> str:
283325
return pattern
284326

285327

286-
def _parse_patterns(pattern: set[str] | str) -> set[str]:
328+
def _parse_patterns(patterns: tuple[str, ...] | set[str] | str) -> set[str]:
287329
"""
288330
Parse and validate file/directory patterns for inclusion or exclusion.
289331
290-
Takes either a single pattern string or set of pattern strings and processes them into a normalized list.
291-
Patterns are split on commas and spaces, validated for allowed characters, and normalized.
332+
Takes either a single pattern string, a tuple of pattern strings, or a set of pattern strings
333+
and processes them into a normalized list. Patterns are split on commas and spaces, validated
334+
for allowed characters, and normalized.
292335
293336
Parameters
294337
----------
295-
pattern : set[str] | str
296-
Pattern(s) to parse - either a single string or set of strings
338+
patterns : tuple[str, ...] | set[str] | str
339+
Pattern(s) to parse - either a single string, a tuple of strings, or a set of strings
297340
298341
Returns
299342
-------
@@ -307,7 +350,11 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]:
307350
dash (-), underscore (_), dot (.), forward slash (/), plus (+), and
308351
asterisk (*) are allowed.
309352
"""
310-
patterns = pattern if isinstance(pattern, set) else {pattern}
353+
# Convert patterns to a set if it's not already a set
354+
if isinstance(patterns, tuple):
355+
patterns = set(patterns)
356+
elif isinstance(patterns, str):
357+
patterns = {patterns}
311358

312359
parsed_patterns: set[str] = set()
313360
for p in patterns:

src/gitingest/repository_clone.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,11 @@
1-
""" This module contains functions for cloning a Git repository to a local path. """
1+
"""
2+
Module for cloning repositories in the gitingest package.
3+
"""
24

35
import asyncio
6+
import shutil
47
from dataclasses import dataclass
8+
from pathlib import Path
59

610
from gitingest.utils import async_timeout
711

@@ -78,6 +82,11 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
7882
if not await _check_repo_exists(url):
7983
raise ValueError("Repository not found, make sure it is public")
8084

85+
# Remove the directory if it exists and is not empty
86+
local_path_obj = Path(local_path)
87+
if local_path_obj.exists() and any(local_path_obj.iterdir()):
88+
shutil.rmtree(local_path_obj)
89+
8190
if commit:
8291
# Scenario 1: Clone and checkout a specific commit
8392
# Clone the repository without depth to ensure full history for checkout
@@ -89,7 +98,6 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
8998
return await _run_git_command(*checkout_cmd)
9099

91100
if branch and branch.lower() not in ("main", "master"):
92-
93101
# Scenario 2: Clone a specific branch with shallow depth
94102
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path]
95103
return await _run_git_command(*clone_cmd)

src/gitingest/repository_ingest.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
1-
""" Main entry point for ingesting a source and processing its contents. """
1+
"""
2+
Module for ingesting repositories in the gitingest package.
3+
"""
24

3-
import asyncio
45
import inspect
56
import shutil
67

@@ -71,7 +72,7 @@ async def ingest(
7172
clone_result = clone_repo(clone_config)
7273

7374
if inspect.iscoroutine(clone_result):
74-
asyncio.run(clone_result)
75+
await clone_result
7576
else:
7677
raise TypeError("clone_repo did not return a coroutine as expected.")
7778

@@ -85,5 +86,4 @@ async def ingest(
8586
finally:
8687
# Clean up the temporary directory if it was created
8788
if parsed_query.url:
88-
# Clean up the temporary directory
8989
shutil.rmtree(TMP_BASE_PATH, ignore_errors=True)

0 commit comments

Comments
 (0)