-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbuild_mapping_multiuser.py
More file actions
118 lines (96 loc) · 4.02 KB
/
build_mapping_multiuser.py
File metadata and controls
118 lines (96 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import re
import csv
from pathlib import Path
# ---------------------------------------------------------------------------
# Configuration: machine-specific paths. Edit these before running the script.
# ---------------------------------------------------------------------------
# 1) Reference folder: one video per word, where the file name without its
#    extension is the word. Only files directly inside this folder are read
#    (main() uses iterdir(), so sub-folders are not searched).
# NOTE(review): the earlier comment said "500 words" while the folder name
# says "5000" - confirm which is correct.
REF_DIR = Path(r"/home/antpc/Downloads/5000_RTH_Videos (Copy)") # <-- change if needed
# 2) Collected folders, keyed by user id. Each key maps to a list of root
#    folders that main() scans recursively; the key is written unchanged to
#    the "user_id" column of the output CSV.
# user002 has multiple paths, so we list them all.
USER_DIRS = {
"001": [
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER001/All_clips_USer001_18-01-2026"),
],
"002": [
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/download_2026-01-05_15-16-42/user_002_clips/output_clips"),
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/user002_output_18-01-2026/output_folder"),
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/user002_output_18-01-2026/Skipped_output_folder_user002"),
],
"003": [
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER003/All_user003_clips_18-01-2026"),
],
"004": [
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER004/output_clips_20-01-2026_user004"),
],
"005": [
Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER005/All_clips_19-01-2026"),
],
}
# File extensions treated as videos. They are compared against the lowercased
# suffix of each file, so matching is case-insensitive (".MP4" is accepted).
VIDEO_EXTS = {".mp4", ".mpg", ".mov", ".mkv", ".avi", ".webm"}
# Output CSV. The path is relative, so the file lands in the current working
# directory; it is opened in "w" mode, which overwrites any existing file.
OUT_CSV = Path("mapping.csv")
def normalize_word(s: str) -> str:
    """Return *s* with surrounding whitespace removed and letters lowercased."""
    trimmed = s.strip()
    return trimmed.lower()
def word_from_reference(ref_file: Path) -> str:
    """Return the lookup word for a reference video.

    The word is the file name without its extension, passed through
    ``normalize_word`` (for example ``Hello.mp4`` -> ``hello``).
    """
    stem = ref_file.stem
    return normalize_word(stem)
# Optional "user001_" / "u001-" style prefix, followed by any run of
# underscore, hyphen or whitespace separators.
_USER_PREFIX_RE = re.compile(r"^(user|u)\d{1,3}[\s_-]*")
# The leading run of ASCII letters is taken as the word.
_LEADING_WORD_RE = re.compile(r"[a-z]+")


def word_from_collected_filename(filename: str) -> str:
    """Extract the word a collected clip belongs to from its file name.

    IMPORTANT: this is the only part you might need to tweak for a different
    naming scheme.

    Rule:
      1. Take the file name without its extension, strip surrounding
         whitespace and lowercase it (the same normalization that is applied
         to reference file names, so both sides of the lookup agree).
      2. Drop a leading user prefix such as ``user001_`` or ``u001-``.
      3. Return the first continuous run of ASCII letters at the start.

    Examples:
        hello_001.mp4            -> hello
        thankyou-user002-10.mp4  -> thankyou
        school12.mp4             -> school
        user001_hello.mp4        -> hello

    Names that start with something else, e.g. ``001_hello_0001.mp4``, are
    not handled by this rule and yield an empty string.

    Args:
        filename: File name (or path) of the collected clip.

    Returns:
        The extracted lowercase word, or ``""`` when no word could be found.
    """
    # Strip so that stray leading/trailing spaces do not make the match fail.
    stem = Path(filename).stem.strip().lower()
    stem = _USER_PREFIX_RE.sub("", stem)
    found = _LEADING_WORD_RE.match(stem)
    return found.group(0) if found else ""
def main():
    """Match collected clips to reference videos and write the mapping CSV.

    Steps:
      1. Index the video files directly inside ``REF_DIR`` by their word.
      2. Recursively scan every folder listed in ``USER_DIRS``, derive each
         clip's word from its file name and look up the reference video.
      3. Write the sorted matches to ``OUT_CSV`` and print a summary.

    Raises:
        RuntimeError: if ``REF_DIR`` holds no file whose extension is in
            ``VIDEO_EXTS``.
    """
    # word -> reference path. If two reference files normalize to the same
    # word, the one iterated last wins.
    ref_lookup = {
        word_from_reference(ref): str(ref)
        for ref in REF_DIR.iterdir()
        if ref.is_file() and ref.suffix.lower() in VIDEO_EXTS
    }
    if not ref_lookup:
        raise RuntimeError(f"No reference videos found in: {REF_DIR}")

    matches = []
    counts = {"scanned": 0, "no_word": 0, "no_ref": 0}

    # Flatten {user_id: [root, ...]} into (user_id, root) pairs, same order.
    folders = (
        (user_id, root)
        for user_id, roots in USER_DIRS.items()
        for root in roots
    )
    for user_id, root in folders:
        if not root.exists():
            print(f"⚠️ Missing folder for user{user_id}: {root}")
            continue

        for clip in root.rglob("*"):
            is_video = clip.is_file() and clip.suffix.lower() in VIDEO_EXTS
            if not is_video:
                continue
            counts["scanned"] += 1

            word = word_from_collected_filename(clip.name)
            if not word:
                counts["no_word"] += 1
                continue

            ref_path = ref_lookup.get(word)
            if not ref_path:
                counts["no_ref"] += 1
                continue

            matches.append((word, ref_path, str(clip), user_id))

    # Order by word, then user id, then collected path.
    matches.sort(key=lambda m: (m[0], m[3], m[2]))

    header = ["word", "reference_path", "collected_path", "user_id"]
    with OUT_CSV.open("w", newline="", encoding="utf-8") as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(matches)

    summary = (
        "\n====== SUMMARY ======",
        f"Reference words found: {len(ref_lookup)}",
        f"Collected videos scanned: {counts['scanned']}",
        f"Rows written to mapping.csv: {len(matches)}",
        f"Skipped (couldn't extract word from filename): {counts['no_word']}",
        f"Skipped (word not found in reference list): {counts['no_ref']}",
        "=====================\n",
        "Next: open mapping.csv and verify a few rows look correct.",
    )
    for line in summary:
        print(line)
if __name__ == "__main__":
main()