Skip to content

Commit ebff8dd

Browse files
authored
Added feature to have a persistent MLC index (mlcommons#197)
1 parent 19054a7 commit ebff8dd

File tree

3 files changed

+213
-20
lines changed

3 files changed

+213
-20
lines changed

.github/workflows/test-mlc-core-actions.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,9 @@ jobs:
205205
206206
- name: Test 22 - Test silent mode
207207
run: |
208-
! mlcr detect,cpu -j -s 2>&1 | grep -q INFO
209-
! mlcr detect,cpu -j --silent 2>&1 | grep -q INFO
208+
mlcr detect,cpu -j -s --quiet
209+
! mlcr detect,cpu -j -s --quiet 2>&1 | grep -q INFO
210+
! mlcr detect,cpu -j --silent --quiet 2>&1 | grep -q INFO
210211
211212
- name: Test 23 - Test verbose mode
212213
run: |

CONTRIBUTORS.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ Once your contribution exceeds 50 lines of code (in total), we will:
2929
## Current Contributors
3030

3131
- **[Arjun Suresh](https://github.com/arjunsuresh)** - *Initial Development Discussions, {Script,Cache} Action implementations*
32-
- **[Anandhu Sooraj](https://github.com/anandhu-eng)** - *Initial Development Discussions, RepoAction implmentation, Github Tests*
32+
- **[Anandhu Sooraj](https://github.com/anandhu-eng)** - *Initial Development Discussions, RepoAction implementation, Github Tests*
33+
- **[Sujith Kanakkassery](https://github.com/sujik18)** - *Initial Development Discussions, Persistent index implementation*
3334
- **[Shaik Masthan](https://github.com/csemasthan)** - *Initial Development Discussions*
3435
- **[Sahil Avaran](https://github.com/sahilavaran)** - *Initial Development Discussions*, added logging
3536
- **[R.A Sidharth](https://github.com/Sid9993)** - *Find repo implementation*
36-
- **[Sujith Kanakkassery](https://github.com/sujik18)** - *Initial Development Discussions*, adding logging to a file
3737
- **[Your Name Here]** - This could be you! 🎉
3838

3939
---

mlc/index.py

Lines changed: 208 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
import yaml
55
from .repo import Repo
6+
from datetime import datetime
67

78
class CustomJSONEncoder(json.JSONEncoder):
89
def default(self, obj):
@@ -35,8 +36,50 @@ def __init__(self, repos_path, repos):
3536
"experiment": os.path.join(repos_path, "index_experiment.json")
3637
}
3738
self.indices = {key: [] for key in self.index_files.keys()}
39+
self.modified_times_file = os.path.join(repos_path, "modified_times.json")
40+
self.modified_times = self._load_modified_times()
41+
self._load_existing_index()
3842
self.build_index()
3943

44+
def _load_modified_times(self):
45+
"""
46+
Load stored mtimes to check for changes in scripts.
47+
"""
48+
if os.path.exists(self.modified_times_file):
49+
try:
50+
# logger.info(f"Loading modified times from {self.modified_times_file}")
51+
with open(self.modified_times_file, "r") as f:
52+
return json.load(f)
53+
except Exception:
54+
return {}
55+
return {}
56+
57+
def _save_modified_times(self):
    """
    Persist the in-memory mtime map to ``self.modified_times_file``.

    Written as pretty-printed JSON (indent=4); any I/O error propagates
    to the caller.
    """
    logger.debug(f"Saving modified times to {self.modified_times_file}")
    with open(self.modified_times_file, "w") as fh:
        json.dump(self.modified_times, fh, indent=4)
def _load_existing_index(self):
66+
"""
67+
Load previously saved index to allow incremental updates.
68+
"""
69+
for folder_type, file_path in self.index_files.items():
70+
if os.path.exists(file_path):
71+
try:
72+
# logger.info(f"Loading existing index for {folder_type}")
73+
with open(file_path, "r") as f:
74+
self.indices[folder_type] = json.load(f)
75+
# Convert repo dicts back into Repo objects
76+
for item in self.indices[folder_type]:
77+
if isinstance(item.get("repo"), dict):
78+
item["repo"] = Repo(**item["repo"])
79+
80+
except Exception:
81+
pass # fall back to empty index
82+
4083
def add(self, meta, folder_type, path, repo):
4184
if not repo:
4285
logger.error(f"Repo for index add for {path} is none")
@@ -87,6 +130,16 @@ def rm(self, meta, folder_type, path):
87130
del(self.indices[folder_type][index])
88131
self._save_indices()
89132

133+
def get_item_mtime(self, file):
    """
    Return the last-modification timestamp of *file* (seconds since epoch).

    Raises:
        OSError: if *file* does not exist or is inaccessible.
    """
    # The previous `latest = 0; if t > latest` dance was dead code left
    # over from a removed multi-file loop — for any file modified after
    # the epoch it reduced to a plain getmtime() call.
    return os.path.getmtime(file)
90143
def build_index(self):
    """
    Build shared indices for script, cache, and experiment folders across
    all repositories, reindexing only items whose meta files changed.

    Persisted state touched:
      - per-meta-file mtimes via ``self.modified_times`` / ``_save_modified_times()``
      - per-folder-type index files via ``_save_indices()``

    Returns:
        None
    """
    # Track every key seen in this pass so stale mtime entries can be pruned.
    current_item_keys = set()
    changed = False
    repos_changed = False

    # Start from the persisted mtimes so unchanged items can be skipped.
    self.modified_times = self._load_modified_times()

    index_json_path = os.path.join(self.repos_path, "index_script.json")

    # If the script index file is missing, force a full rebuild by
    # forgetting all stored mtimes (every item then looks "new").
    if not os.path.exists(index_json_path):
        logger.warning("index_script.json missing. Forcing full index rebuild...")
        logger.debug("Resetting modified_times...")
        self.modified_times = {}
        self._save_modified_times()
    else:
        logger.debug("index_script.json exists. Skipping forced rebuild.")

    # A change to repos.json (repos added/removed) invalidates everything.
    # NOTE(review): getmtime raises OSError if repos.json is absent — the
    # caller appears to guarantee it exists; confirm.
    repos_json_path = os.path.join(self.repos_path, "repos.json")
    repos_mtime = os.path.getmtime(repos_json_path)

    repos_key = repos_json_path
    old = self.modified_times.get(repos_key)
    repo_old_mtime = old["mtime"] if isinstance(old, dict) else old

    logger.debug(f"Current repos.json mtime: {repos_mtime}")
    logger.debug(f"Old repos.json mtime: {repo_old_mtime}")
    current_item_keys.add(repos_key)

    # If repos.json changed, reset all indices and keep only its own
    # mtime entry so every item is reindexed below.
    if repo_old_mtime is None or repo_old_mtime != repos_mtime:
        logger.debug("repos.json modified. Clearing index ........")
        self.indices = {key: [] for key in self.index_files.keys()}
        self.modified_times[repos_key] = {
            "mtime": repos_mtime,
            "date_time": datetime.fromtimestamp(repos_mtime).strftime("%Y-%m-%d %H:%M:%S")
        }
        # Clear modified times except for repos.json itself.
        self.modified_times = {repos_key: self.modified_times[repos_key]}
        self._save_indices()
        self._save_modified_times()
        repos_changed = True
    else:
        logger.debug("Repos.json not modified")

    for repo in self.repos:
        repo_path = repo.path
        if not os.path.isdir(repo_path):
            continue
        logger.debug(f"Checking repository: {repo_path}")
        # Filter for relevant directories in the repo.
        for folder_type in ["script", "cache", "experiment"]:
            logger.debug(f"Checking folder type: {folder_type}")
            folder_path = os.path.join(repo_path, folder_type)
            if not os.path.isdir(folder_path):
                continue

            # Process each automation directory.
            for automation_dir in os.listdir(folder_path):
                automation_path = os.path.join(folder_path, automation_dir)
                if not os.path.isdir(automation_path):
                    logger.debug(f"Skipping non-directory automation path: {automation_path}")
                    continue

                yaml_path = os.path.join(automation_path, "meta.yaml")
                json_path = os.path.join(automation_path, "meta.json")

                # meta.yaml takes precedence over meta.json when both exist.
                if os.path.isfile(yaml_path):
                    config_path = yaml_path
                elif os.path.isfile(json_path):
                    config_path = json_path
                else:
                    # No meta file: drop any stale bookkeeping for this dir.
                    logger.debug(f"No config file found in {automation_path}, skipping")
                    if automation_dir in self.modified_times:
                        del self.modified_times[automation_dir]
                    if any(automation_dir in item["path"] for item in self.indices[folder_type]):
                        logger.debug(f"Removed index entry (if it exists) for {folder_type} : {automation_dir}")
                        self._remove_index_entry(automation_path)
                        self._save_indices()
                    continue

                current_item_keys.add(config_path)
                mtime = self.get_item_mtime(config_path)

                old = self.modified_times.get(config_path)
                old_mtime = old["mtime"] if isinstance(old, dict) else old

                # Skip unchanged items unless a repos.json change forced a
                # full rebuild. (Was `repos_changed != 1` — same truth
                # table, now idiomatic.)
                if old_mtime == mtime and not repos_changed:
                    continue
                if old_mtime is None:
                    logger.debug(f"New config file detected: {config_path}. Adding to index.")
                logger.debug(f"{config_path} is modified, index getting updated")
                if config_path not in self.modified_times:
                    logger.debug(f"*************{config_path} not found in modified_times; creating new entry***************")

                self.modified_times[config_path] = {
                    "mtime": mtime,
                    "date_time": datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M:%S")
                }
                logger.debug(f"Modified time for {config_path} updated to {mtime}")
                changed = True
                # Meta file changed (or is new), so reindex it.
                self._process_config_file(config_path, folder_type, automation_path, repo)

    # Prune entries whose meta files vanished since the last run.
    old_keys = set(self.modified_times.keys())
    deleted_keys = old_keys - current_item_keys
    for stale_key in deleted_keys:
        logger.warning(f"Detected deleted item, removing entry from modified times: {stale_key}")
        del self.modified_times[stale_key]
        folder_key = os.path.dirname(stale_key)
        logger.warning(f"Removing index entry for folder: {folder_key}")
        self._remove_index_entry(folder_key)
        changed = True
    logger.debug(f"Deleted keys removed from modified times and indices: {deleted_keys}")

    if changed:
        logger.debug("Changes detected, saving updated index and modified times.")
        self._save_modified_times()
        self._save_indices()
        logger.debug("**************Index updated (changes detected).*************************")
    else:
        logger.debug("**************Index unchanged (no changes detected).********************")
def _remove_index_entry(self, key):
    """
    Remove every indexed item whose stored path contains *key*.

    Matching is by substring: callers pass either a full automation path
    or a parent folder of one.
    """
    logger.debug(f"Removing index entry for {key}")
    for folder_type, entries in self.indices.items():
        self.indices[folder_type] = [e for e in entries if key not in e["path"]]
294+
def _delete_by_uid(self, folder_type, uid, alias):
    """
    Delete old index entry using UID (prevents duplicates).
    """
    logger.debug(f"Deleting and updating index entry for the script {alias} with UID {uid}")
    kept = []
    for entry in self.indices[folder_type]:
        if entry["uid"] != uid:
            kept.append(entry)
    self.indices[folder_type] = kept
123303

124304
def _process_config_file(self, config_file, folder_type, folder_path, repo):
125305
"""
@@ -133,25 +313,36 @@ def _process_config_file(self, config_file, folder_type, folder_path, repo):
133313
Returns:
134314
None
135315
"""
316+
if config_file is None:
317+
logger.debug(f"No meta file in {folder_path}, skipping")
318+
return
319+
136320
try:
137321
# Determine the file type based on the extension
138322
if config_file.endswith(".yaml") or config_file.endswith(".yml"):
139323
with open(config_file, "r") as f:
140-
data = yaml.safe_load(f)
324+
data = yaml.safe_load(f) or {}
141325
elif config_file.endswith(".json"):
142326
with open(config_file, "r") as f:
143-
data = json.load(f)
327+
data = json.load(f) or {}
144328
else:
145-
logger.info(f"Skipping {config_file}: Unsupported file format.")
329+
logger.warning(f"Skipping {config_file}: Unsupported file format.")
330+
return
331+
332+
if not isinstance(data, dict):
333+
logger.warning(f"Skipping {config_file}: Invalid or empty meta")
146334
return
147-
148335
# Extract necessary fields
149336
unique_id = data.get("uid")
337+
if not unique_id:
338+
logger.warning(f"Skipping {config_file}: missing uid")
339+
return
150340
tags = data.get("tags", [])
151341
alias = data.get("alias", None)
152342

153343
# Validate and add to indices
154344
if unique_id:
345+
self._delete_by_uid(folder_type, unique_id, alias)
155346
self.indices[folder_type].append({
156347
"uid": unique_id,
157348
"tags": tags,
@@ -160,7 +351,8 @@ def _process_config_file(self, config_file, folder_type, folder_path, repo):
160351
"repo": repo
161352
})
162353
else:
163-
logger.info(f"Skipping {config_file}: Missing 'uid' field.")
354+
logger.warning(f"Skipping {config_file}: Missing 'uid' field.")
355+
164356
except Exception as e:
165357
logger.error(f"Error processing {config_file}: {e}")
166358

@@ -178,6 +370,6 @@ def _save_indices(self):
178370
try:
179371
with open(output_file, "w") as f:
180372
json.dump(index_data, f, indent=4, cls=CustomJSONEncoder)
181-
#logger.debug(f"Shared index for {folder_type} saved to {output_file}.")
373+
logger.debug(f"Shared index for {folder_type} saved to {output_file}.")
182374
except Exception as e:
183375
logger.error(f"Error saving shared index for {folder_type}: {e}")

0 commit comments

Comments
 (0)