1+ """A script to populate the 'validated_runs' field in data records."""
12import json
23import os
34import re
45
6+
57def build_validation_lookup (directory ):
6- """
7- Scans all 'validated-runs' files to build a lookup table mapping
8- a record's recid to its validation type ('full' or 'muonsonly').
8+ """Scan all 'validated-runs' files to build a lookup table.
9+
10+ This function maps a record's recid to its validation type ('full' or 'muonsonly').
911 """
1012 validation_lookup = {}
11- validated_files_regex = re .compile (r' .*validated-runs.*\.json' , re .IGNORECASE )
12- muons_only_regex = re .compile (r' only valid muons' , re .IGNORECASE )
13+ validated_files_regex = re .compile (r" .*validated-runs.*\.json" , re .IGNORECASE )
14+ muons_only_regex = re .compile (r" only valid muons" , re .IGNORECASE )
1315
1416 for filename in os .listdir (directory ):
1517 if validated_files_regex .match (filename ):
1618 filepath = os .path .join (directory , filename )
1719 try :
18- with open (filepath , 'r' , encoding = ' utf-8' ) as f :
20+ with open (filepath , "r" , encoding = " utf-8" ) as f :
1921 data = json .load (f )
2022
2123 records_to_process = []
@@ -28,13 +30,15 @@ def build_validation_lookup(directory):
2830 if not isinstance (record , dict ):
2931 continue
3032
31- recid = record .get (' recid' )
32- title = record .get (' title' , '' )
33- title_additional = record .get (' title_additional' , '' )
33+ recid = record .get (" recid" )
34+ title = record .get (" title" , "" )
35+ title_additional = record .get (" title_additional" , "" )
3436
3537 if recid and (title or title_additional ):
3638 validation_type = "full"
37- if muons_only_regex .search (title ) or muons_only_regex .search (title_additional ):
39+ if muons_only_regex .search (title ) or muons_only_regex .search (
40+ title_additional
41+ ):
3842 validation_type = "muonsonly"
3943 validation_lookup [str (recid )] = validation_type
4044
@@ -45,18 +49,22 @@ def build_validation_lookup(directory):
4549
4650 return validation_lookup , validated_files_regex
4751
52+
4853def fix_and_add_validated_runs (directory , validation_lookup , validated_files_regex ):
54+ """Add a 'validated_runs' field to records based on a lookup table.
55+
56+ This function uses a pre-built validation lookup table and skips all
57+ validated-run files themselves.
4958 """
50- Adds a 'validated_runs' field to records based on the pre-built
51- validation lookup table.
52- """
53- validated_description_regex = re .compile (r'validated (runs|lumi sections)' , re .IGNORECASE )
59+ validated_description_regex = re .compile (
60+ r"validated (runs|lumi sections)" , re .IGNORECASE
61+ )
5462
5563 for filename in os .listdir (directory ):
5664 if not validated_files_regex .match (filename ):
5765 filepath = os .path .join (directory , filename )
5866 try :
59- with open (filepath , 'r' , encoding = ' utf-8' ) as f :
67+ with open (filepath , "r" , encoding = " utf-8" ) as f :
6068 data = json .load (f )
6169
6270 records_to_process = []
@@ -67,35 +75,48 @@ def fix_and_add_validated_runs(directory, validation_lookup, validated_files_reg
6775
6876 modified = False
6977 for record in records_to_process :
70- if not isinstance (record , dict ) or ' validated_runs' in record :
78+ if not isinstance (record , dict ) or " validated_runs" in record :
7179 continue
7280
73- if ('abstract' in record and
74- isinstance (record .get ('abstract' ), dict ) and
75- validated_description_regex .search (record ['abstract' ].get ('description' , '' ))):
81+ if (
82+ "abstract" in record
83+ and isinstance (record .get ("abstract" ), dict )
84+ and validated_description_regex .search (
85+ record ["abstract" ].get ("description" , "" )
86+ )
87+ ):
7688
77- links = record [' abstract' ].get (' links' , [])
89+ links = record [" abstract" ].get (" links" , [])
7890 if links :
79- record [' validated_runs' ] = []
91+ record [" validated_runs" ] = []
8092 for link in links :
81- link_recid = link .get (' recid' )
93+ link_recid = link .get (" recid" )
8294 if link_recid :
83- validation_type = validation_lookup .get (str (link_recid ), "full" )
84- record ['validated_runs' ].append ({
85- "recid" : link_recid ,
86- "validation" : validation_type
87- })
95+ validation_type = validation_lookup .get (
96+ str (link_recid ), "full"
97+ )
98+ record ["validated_runs" ].append (
99+ {
100+ "recid" : link_recid ,
101+ "validation" : validation_type ,
102+ }
103+ )
88104 modified = True
89105 if modified :
90- final_data = records_to_process if isinstance (data , list ) else records_to_process [0 ]
91- with open (filepath , 'w' , encoding = 'utf-8' ) as f :
106+ final_data = (
107+ records_to_process
108+ if isinstance (data , list )
109+ else records_to_process [0 ]
110+ )
111+ with open (filepath , "w" , encoding = "utf-8" ) as f :
92112 json .dump (final_data , f , indent = 2 , ensure_ascii = False )
93- f .write (' \n ' )
113+ f .write (" \n " )
94114 print (f"Updated validated runs in { filename } " )
95115
96116 except (json .JSONDecodeError , IOError ) as e :
97117 print (f"An error occurred with { filename } : { e } " )
98118
119+
99120if __name__ == "__main__" :
100121 data_directory = "data/records"
101122 print ("Building validation lookup table..." )
@@ -104,4 +125,4 @@ def fix_and_add_validated_runs(directory, validation_lookup, validated_files_reg
104125
105126 print ("\n Processing dataset records..." )
106127 fix_and_add_validated_runs (data_directory , validation_map , validated_files_pattern )
107- print ("\n Script finished." )
128+ print ("\n Script finished." )
0 commit comments