-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgit_unreachable_objects_scraper.py
More file actions
160 lines (137 loc) · 5.17 KB
/
git_unreachable_objects_scraper.py
File metadata and controls
160 lines (137 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import subprocess
import argparse
def get_unreachable_objects(repo_path="."):
    '''
    Find unreachable Git objects using fsck.

    Runs ``git fsck --full --no-reflogs`` in ``repo_path`` and collects every
    stdout line that begins with "unreachable" or "dangling".

    Args:
        repo_path: Directory in which git is executed (defaults to the
            current directory).

    Returns:
        A list of ``(object_type, sha)`` tuples, in the order fsck printed
        them. An empty list is returned when no line matches, when git exits
        with a non-zero status, or when git could not be started at all; in
        the two failure cases an error message is printed first.
    '''
    try:
        result = subprocess.run(
            ["git", "fsck", "--full", "--no-reflogs"],
            cwd=repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print("Error running git fsck:", e.stderr)
        return []
    except OSError as e:
        # Raised before git even runs: the executable is missing or
        # repo_path is not a usable directory. Report it the same way as a
        # failed fsck instead of letting a traceback escape.
        print("Error running git fsck:", e)
        return []

    unreachable_objects = []
    for line in result.stdout.splitlines():
        if not line.startswith(("unreachable", "dangling")):
            continue
        # Expected shape: "<status> <type> <sha>"; shorter lines are ignored.
        parts = line.split()
        if len(parts) >= 3:
            unreachable_objects.append((parts[1], parts[2]))
    return unreachable_objects
def get_object_content(repo_path, sha):
    '''
    Get the raw content of a Git object using git cat-file -p.

    Args:
        repo_path: Directory in which git is executed.
        sha: Object name passed to ``git cat-file -p``.

    Returns:
        The command's stdout as a string. On failure no exception is raised;
        instead a string of the form ``"Error reading object <sha>: <detail>"``
        is returned, where the detail is git's stderr or the OS error text.
    '''
    try:
        result = subprocess.run(
            ["git", "cat-file", "-p", sha],
            cwd=repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # Blobs can hold arbitrary bytes; substitute anything that does
            # not decode rather than raising UnicodeDecodeError.
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        return f"Error reading object {sha}: {e.stderr}"
    except OSError as e:
        # git is not installed or repo_path is not a usable directory.
        return f"Error reading object {sha}: {e}"
    return result.stdout
def get_commit_parent(repo_path, commit_sha):
    '''
    Get the parent commit SHA of a given commit.

    Only the header section of the commit text (the lines before the first
    blank line) is examined, so a commit-message line that happens to start
    with "parent " is not mistaken for a parent header.

    Args:
        repo_path: Directory in which git is executed.
        commit_sha: Name of the commit object to inspect.

    Returns:
        The SHA from the first "parent" header, or ``None`` when the header
        section contains no such line.
    '''
    content = get_object_content(repo_path, commit_sha)
    for line in content.splitlines():
        if not line:
            # A blank line ends the headers; everything after it is the
            # free-form commit message.
            break
        if line.startswith("parent "):
            fields = line.split()
            # Guard against a malformed header with no value after "parent".
            if len(fields) > 1:
                return fields[1]
    return None
def get_commit_tree_sha(repo_path, commit_sha):
    '''
    Return the tree SHA named by a commit object.

    The commit text is fetched with ``get_object_content`` and the first line
    starting with "tree " supplies the result (its second whitespace-separated
    field). ``None`` is returned when no line starts with "tree ".
    '''
    commit_text = get_object_content(repo_path, commit_sha)
    tree_headers = (
        row for row in commit_text.splitlines() if row.startswith("tree ")
    )
    first_header = next(tree_headers, None)
    if first_header is None:
        return None
    return first_header.split()[1]
def get_diff_between_commits(repo_path, parent_sha, child_sha):
    '''
    Get the diff between two commits.

    Args:
        repo_path: Directory in which git is executed.
        parent_sha: First revision passed to ``git diff``.
        child_sha: Second revision passed to ``git diff``.

    Returns:
        The stdout of ``git diff <parent_sha> <child_sha>`` as a string. On
        failure no exception is raised; a string of the form
        ``"Error running diff: <detail>"`` is returned instead, where the
        detail is git's stderr or the OS error text.
    '''
    try:
        result = subprocess.run(
            ["git", "diff", parent_sha, child_sha],
            cwd=repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # Diffs may include bytes that do not decode; substitute them
            # rather than raising UnicodeDecodeError.
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        return f"Error running diff: {e.stderr}"
    except OSError as e:
        # git is not installed or repo_path is not a usable directory.
        return f"Error running diff: {e}"
    return result.stdout
def get_blobs_from_tree(repo_path, tree_sha, prefix=""):
    '''
    Recursively get all blobs and their paths from a tree SHA.

    Each tree level is listed with ``git ls-tree -z``. The ``-z`` form
    terminates entries with NUL and leaves file names unquoted, and each
    entry is split at the TAB that separates the metadata from the path, so
    names containing spaces are kept whole.

    Args:
        repo_path: Directory in which git is executed.
        tree_sha: Name of the tree object to list.
        prefix: Path of this tree relative to the root tree; prepended to
            every entry name. Empty for the root.

    Returns:
        A list of ``(path, blob_sha)`` tuples covering this tree and every
        nested tree. Entries that are neither blobs nor trees are skipped.
        If this tree cannot be listed, an error message is printed and an
        empty list is returned; a failure in a nested tree only drops that
        subtree's entries.
    '''
    try:
        result = subprocess.run(
            ["git", "ls-tree", "-z", tree_sha],
            cwd=repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # File names are arbitrary bytes; substitute anything that does
            # not decode rather than raising UnicodeDecodeError.
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error reading tree {tree_sha}: {e.stderr}")
        return []
    except OSError as e:
        # git is not installed or repo_path is not a usable directory.
        print(f"Error reading tree {tree_sha}: {e}")
        return []

    blobs = []
    for entry in result.stdout.split("\0"):
        # Entry layout: "<mode> <type> <sha>\t<path>".
        meta, tab, path = entry.partition("\t")
        fields = meta.split()
        if not tab or len(fields) < 3:
            # Trailing empty chunk after the last NUL, or an unexpected shape.
            continue
        type_, sha = fields[1], fields[2]
        full_path = f"{prefix}/{path}".lstrip("/")
        if type_ == "blob":
            blobs.append((full_path, sha))
        elif type_ == "tree":
            blobs.extend(get_blobs_from_tree(repo_path, sha, full_path))
    return blobs
if __name__ == "__main__":
    # Command-line entry point: list the unreachable objects in a repository
    # and, for each one that is a commit, print its metadata, its diff against
    # the parent (when one is found) and the files in its tree.
    cli = argparse.ArgumentParser(description="Detect and display unreachable Git commit contents.")
    cli.add_argument("repo", nargs="?", default=".", help="Path to the Git repository")
    cli.add_argument("-c", "--content", action="store_true", help="Display content of unreachable files")
    args = cli.parse_args()
    repo_dir = args.repo

    found_objects = get_unreachable_objects(repo_dir)
    if not found_objects:
        print("No unreachable Git objects found.")

    for kind, commit_sha in found_objects:
        # Only commits are reported; other object types are skipped.
        if kind != "commit":
            continue

        print(f"\n=== Unreachable commit {commit_sha} ===")
        print("--- Commit Metadata ---")
        print(get_object_content(repo_dir, commit_sha).strip())

        parent = get_commit_parent(repo_dir, commit_sha)
        if not parent:
            print("No parent found. Possibly an initial commit.")
        else:
            print("\n--- Diff from parent ---")
            print(get_diff_between_commits(repo_dir, parent, commit_sha).strip())

        print("\n--- Commit Tree Files ---")
        root_tree = get_commit_tree_sha(repo_dir, commit_sha)
        if not root_tree:
            print("No tree found.")
            continue

        for path_in_tree, blob_id in get_blobs_from_tree(repo_dir, root_tree):
            print(f"File: {path_in_tree}\nBlob SHA: {blob_id}")
            # File bodies are printed only when -c/--content was given.
            if args.content:
                print(get_object_content(repo_dir, blob_id).strip())