Skip to content

Commit 6ba4b7d

Browse files
authored
Add file status to tutorial statistics (#6912)
This PR adds file status information — Added (A), Copied (C), Deleted (D), Modified (M), Renamed (R) — to the tutorial statistics tracking. The change enhances the `get_tutorials_stats.py` script to capture Git status information using the `--name-status` flag and store it in the `filenames.csv` output. Changes: * Added a status field to the FileInfo class * Modified the file parsing logic to extract status information from Git output * Status is now included in the **filenames.csv** output for each file Local test generates this `filenames.csv`: | commit_id | date | filename | lines_added | lines_deleted | status | |-----------|------|----------|-------------|---------------|--------| | 7c45ceb313 | 2025-07-11 | tutorials/index.rst | 17 | 171 | M | | f4dda5dca1 | 2025-07-10 | tutorials/conf.py | 4 | 2 | M | | 3569db3e78 | 2025-07-10 | tutorials/conf.py | 0 | 1 | M | ... How to test: 1. Comment out the part that uploads to S3 2. Add a helper that saves to a local file: ``` def save_to_local_file(filename: str, docs: list[dict[str, Any]]) -> None: print(f"Writing {len(docs)} documents to {filename}") with open(filename, "w", newline="") as f: writer = csv.DictWriter(f, fieldnames=sorted(docs[0].keys())) writer.writeheader() writer.writerows(docs) print(f"Done writing to {filename}") ``` 3. Add this to main: ``` save_to_local_file("metadata.csv", history_log) save_to_local_file("filenames.csv", filenames) ``` 4. Make sure that tutorials_dir and pytorch_doc_dir point to the correct locations of the tutorials and pytorch repos.
1 parent 68ed115 commit 6ba4b7d

File tree

1 file changed

+63
-3
lines changed

1 file changed

+63
-3
lines changed

.github/scripts/get_tutorials_stats.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class FileInfo(NamedTuple):
103103
filename: str
104104
lines_added: int
105105
lines_deleted: int
106+
status: str # 'A' for added, 'M' for modified, 'D' for deleted
106107

107108

108109
class CommitInfo(NamedTuple):
@@ -123,8 +124,19 @@ def get_file_names(
123124
cwd=cwd,
124125
env={"TZ": "UTC"},
125126
).split("\n")
126-
rc: List[CommitInfo] = []
127127

128+
# Get name-status for file status (A/M/D)
129+
status_cmd = "git log --date=short --pretty='format:%h;%ad' --name-status"
130+
if path_filter:
131+
status_cmd += f" -- {path_filter}"
132+
status_lines = run_command(
133+
status_cmd,
134+
cwd=cwd,
135+
env={"TZ": "UTC"},
136+
).split("\n")
137+
138+
# Process numstat output
139+
rc: List[CommitInfo] = []
128140
for line in lines:
129141
line = line.strip()
130142
if not line:
@@ -134,12 +146,59 @@ def get_file_names(
134146
rc.append(CommitInfo(commit_hash, date, []))
135147
else:
136148
added, deleted, name = line.split("\t")
149+
# Handle renamed files (containing =>)
150+
if " => " in name:
151+
name = name.split(" => ")[1] # Use only the new filename
137152
# Special casing for binary files
138153
if added == "-":
139154
assert deleted == "-"
140-
rc[-1].files.append(FileInfo(name, -1, -1))
155+
rc[-1].files.append(FileInfo(name, -1, -1, ""))
141156
else:
142-
rc[-1].files.append(FileInfo(name, int(added), int(deleted)))
157+
rc[-1].files.append(FileInfo(name, int(added), int(deleted), ""))
158+
159+
# Process name-status output to add status information
160+
current_commit = None
161+
status_map: Dict[str, Dict[str, str]] = {} # Maps commit_id -> {filename -> status}
162+
163+
for line in status_lines:
164+
line = line.strip()
165+
if not line:
166+
continue
167+
elif ";" in line: # This is a commit line
168+
commit_hash, date = line.split(";")
169+
current_commit = commit_hash # Update current_commit here
170+
else: # This is a file status line
171+
parts = line.split("\t")
172+
status = parts[0]
173+
if status.startswith("R") or status.startswith("C"):
174+
# Handle renamed/copied files
175+
old_filename = parts[1]
176+
new_filename = parts[2]
177+
if current_commit is not None:
178+
standardized_status = status[0] # Just take first character
179+
status_map.setdefault(current_commit, {})[new_filename] = (
180+
standardized_status
181+
)
182+
else:
183+
filename = parts[1] if len(parts) > 1 else ""
184+
if current_commit is not None and filename:
185+
status_map.setdefault(current_commit, {})[filename] = status
186+
187+
# Update file statuses
188+
for commit in rc:
189+
for i, file_info in enumerate(commit.files):
190+
if (
191+
commit.commit_id in status_map
192+
and file_info.filename in status_map[commit.commit_id]
193+
):
194+
# Replace the FileInfo with a new one that includes the status
195+
commit.files[i] = FileInfo(
196+
file_info.filename,
197+
file_info.lines_added,
198+
file_info.lines_deleted,
199+
status_map[commit.commit_id][file_info.filename],
200+
)
201+
143202
return rc
144203

145204

@@ -153,6 +212,7 @@ def convert_to_dict(
153212
"filename": i.filename,
154213
"lines_added": i.lines_added,
155214
"lines_deleted": i.lines_deleted,
215+
"status": i.status,
156216
}
157217
for i in entry.files
158218
]

0 commit comments

Comments
 (0)