1717 else, set the type to "Unknown".
18185. Finally, force instructions whose names start with specific prefixes:
1919 - Names starting with "fcvt" or "fmv" are forced to R-type.
20- - **Loads** (names starting with "lb", "ld", "lh", or "lw") are forced to I-type.
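20+ - Load-style instructions (names starting with "lb", "ld", "lh", "lw", "lr", or "li") are forced to I-type, and names starting with "sw.", "sh.", "sd.", or "sb." are forced to S-type.
21+ - Any instruction whose name contains "fence", or starts with "cbo." or "ssrdp", is forced to I-type.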
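22+ 6. As a fallback, if all tests have failed (the type is still "Unknown")
23+ and the encoding’s variable names include at least two of: a name ending in "s1", a name ending in "s2", a name ending in "d", then the type is set to R-type; otherwise, if they include "csr", "rd", and either "imm" or "uimm", it is set to I-type.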
2124
22- Once determined, the script inserts (or updates) a new field named `type: `
25+ Once determined, the script inserts (or updates) a new field named `format:`
2326immediately after the `long_name:` field.
2427"""
2528
2932from ruamel .yaml .scalarstring import PlainScalarString
3033from ruamel .yaml .representer import RoundTripRepresenter
3134
32-
33- yaml = YAML (typ = "rt" ) # Use round-trip mode
35+ # Use round-trip mode to preserve as much of the original formatting as possible.
36+ yaml = YAML (typ = "rt" )
3437yaml .preserve_quotes = True # Preserve original quoting
3538yaml .indent (mapping = 2 , sequence = 4 , offset = 2 )
3639yaml .width = 4096 # Prevent line wrapping
3740
3841
3942def represent_plain_str (dumper , data ):
40- # Force plain style (empty string) regardless of the content .
43+ # Force plain style (no quotes) for PlainScalarString instances .
4144 return dumper .represent_scalar ("tag:yaml.org,2002:str" , data , style = "" )
4245
4346
@@ -51,13 +54,12 @@ def parse_location(location):
5154 - If the location is an integer, it is treated as a single bit
5255 (e.g. 7 becomes [(7, 7)]).
5356 - If it is a string (e.g. "31|7|30-25|11-8"), it is assumed to be delimited by '|'
54- characters. Each segment is either a single bit (e.g. "7") or a range (e.g. "30-25") .
57+ characters. Each segment is either a single bit or a range.
5558
5659 Returns a list of tuples in the form (high_bit, low_bit).
5760 """
5861 if isinstance (location , int ):
5962 return [(location , location )]
60-
6163 segments = [seg .strip () for seg in location .split ("|" ) if seg .strip ()]
6264 parsed = []
6365 for seg in segments :
@@ -89,7 +91,6 @@ def identify_immediate_type(imm_location):
8991 """
9092 segments = parse_location (imm_location )
9193 seg_set = set (segments )
92-
9394 if len (segments ) == 1 and segments [0 ] == (31 , 20 ):
9495 return "I-type"
9596 if len (segments ) == 2 and seg_set == {(31 , 25 ), (11 , 7 )}:
@@ -100,7 +101,6 @@ def identify_immediate_type(imm_location):
100101 return "U-type"
101102 if len (segments ) == 4 and seg_set == {(31 , 31 ), (30 , 21 ), (20 , 20 ), (19 , 12 )}:
102103 return "J-type"
103-
104104 return "Unknown"
105105
106106
@@ -115,7 +115,6 @@ def check_rtype_registers(variables):
115115 found_source1 = False
116116 found_source2 = False
117117 found_dest = False
118-
119118 for var in variables :
120119 loc_str = var .get ("location" , "" )
121120 segments = parse_location (loc_str )
@@ -127,7 +126,6 @@ def check_rtype_registers(variables):
127126 found_source2 = True
128127 elif seg == (11 , 7 ):
129128 found_dest = True
130-
131129 return found_source1 and found_source2 and found_dest
132130
133131
@@ -142,19 +140,12 @@ def classify_compressed(match_field):
142140 Returns one of:
143141 "CIW", "CL", "CS", "CI", "CR", "CB", "CJ"
144142 or falls back to "C-type" if none match.
145-
146- Note: This mapping is a simplification and may not cover all cases.
147143 """
148144 if len (match_field ) != 16 :
149145 return "C-type"
150- # In our assumed representation, bit15 is match_field[0] and bit0 is match_field[15]
151- # Extract the two least-significant bits (bits 1:0):
152146 group = match_field [- 2 :]
153- # Extract bits 15:13 as funct3:
154147 funct3 = match_field [0 :3 ]
155-
156148 if group == "00" :
157- # Group 0: usually CIW, CL, or CS.
158149 if funct3 == "000" :
159150 return "CIW" # e.g., C.ADDI4SPN
160151 elif funct3 == "010" :
@@ -164,7 +155,6 @@ def classify_compressed(match_field):
164155 else :
165156 return "C-type"
166157 elif group == "01" :
167- # Group 1: often CI, CR, or CB.
168158 if funct3 in ["000" , "010" , "011" ]:
169159 return "CI" # e.g., C.ADDI, C.LI, C.ADDI16SP
170160 elif funct3 == "100" :
@@ -176,7 +166,6 @@ def classify_compressed(match_field):
176166 else :
177167 return "C-type"
178168 elif group == "10" :
179- # Group 2: similar to group 1.
180169 if funct3 in ["000" , "010" , "011" ]:
181170 return "CI"
182171 elif funct3 == "100" :
@@ -194,7 +183,7 @@ def classify_compressed(match_field):
194183def ensure_plain_match (enc ):
195184 """
196185 Ensure that if the given encoding dict has a 'match' field that is a string,
197- it is wrapped in PlainScalarString so that it is output without quotes.
186+ it is wrapped in PlainScalarString so that it is output without added quotes.
198187 """
199188 if isinstance (enc , dict ) and "match" in enc :
200189 match_value = enc ["match" ]
@@ -208,15 +197,20 @@ def process_file(filepath):
208197 - Determine the instruction type using the encoding section.
209198 - If the chosen encoding's "match" field is 16 characters long and is not entirely hardcoded,
210199 classify the instruction using classify_compressed().
200+ - If the match field is entirely hardcoded (only "0" and "1"), force the instruction
201+ type to I-type; this check is applied first, so it also covers 16-character match fields.
211202 - Otherwise, if an "imm" variable is present, use it to classify the instruction.
212203 - Else if a "shamt" variable is present, classify the instruction as I-type.
213204 - Otherwise, if registers appear as expected for R-type, classify as R-type;
214205 else, set the type to "Unknown".
215206 - Finally, force instructions whose names start with specific prefixes:
216207 - Names starting with "fcvt" or "fmv" are forced to R-type.
217- - **Load instructions** (names starting with "lb", "ld", "lh", or "lw") are forced to I-type.
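208+ - Load-style instructions (names starting with "lb", "ld", "lh", "lw", "lr", or "li") are forced to I-type, and names starting with "sw.", "sh.", "sd.", or "sb." are forced to S-type.
209+ - If the instruction name contains "fence", or starts with "cbo." or "ssrdp", force it to I-type.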
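210+ - As a fallback: if all tests have failed (type is "Unknown") and the encoding's variable names include
211+ at least two of a name ending in "s1", "s2", or "d", force the type to R-type; otherwise, if they include "csr", "rd", and either "imm" or "uimm", force it to I-type.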
218212 - Insert (or update) a new field "format:" immediately after "long_name:".
219- - Ensure that the 'match' field remains unquoted by wrapping it in PlainScalarString .
213+ - Ensure that the 'match' field remains unquoted.
220214 - Write the updated YAML back to the same file.
221215 """
222216 try :
@@ -228,29 +222,26 @@ def process_file(filepath):
228222
229223 # Handle nested encoding (e.g., RV32, RV64) versus flat encoding.
230224 encoding = data .get ("encoding" , {})
231- chosen_encoding = {}
232225 if isinstance (encoding , dict ) and ("RV32" in encoding or "RV64" in encoding ):
233- # Prefer RV32 if available; otherwise use RV64.
234226 chosen_encoding = encoding .get ("RV32" , encoding .get ("RV64" , {}))
235227 else :
236228 chosen_encoding = encoding
237229
238- # First, if the match field is 16 characters long, classify as a specific C-type.
239230 match_field = chosen_encoding .get ("match" , "" )
240- if isinstance (match_field , str ) and len (match_field ) == 16 :
231+ # If the match field is entirely hardcoded (only "0" and "1"), force I-type.
232+ if isinstance (match_field , str ) and match_field and set (match_field ) <= {"0" , "1" }:
233+ inst_type = "I-type"
234+ elif isinstance (match_field , str ) and len (match_field ) == 16 :
241235 inst_type = classify_compressed (match_field )
242236 else :
243- # Otherwise, use our usual tests.
244237 variables = chosen_encoding .get ("variables" , [])
245238 imm_location = None
246239 shamt_exists = False
247-
248240 for var in variables :
249241 if var .get ("name" ) == "imm" :
250242 imm_location = var .get ("location" )
251243 if var .get ("name" ) == "shamt" :
252244 shamt_exists = True
253-
254245 if imm_location is not None :
255246 inst_type = identify_immediate_type (imm_location )
256247 elif shamt_exists :
@@ -262,18 +253,51 @@ def process_file(filepath):
262253
263254 # Force specific instruction types based on the instruction name.
264255 inst_name = data .get ("name" , "" ).lower ()
265- # Force instructions starting with "fcvt" or "fmv" to be R-type.
266256 if inst_name .startswith ("fcvt" ) or inst_name .startswith ("fmv" ):
267257 inst_type = "R-type"
268- # Force load instructions (lb, ld, lh, lw) to be I-type.
269- elif inst_name .startswith (("lb" , "ld" , "lh" , "lw" , "lr" )):
270- # Loads are I-type
258+ elif inst_name .startswith (("lb" , "ld" , "lh" , "lw" , "lr" , "li" )):
271259 inst_type = "I-type"
260+ elif inst_name .startswith (("sw." , "sh." , "sd." , "sb." )):
261+ inst_type = "S-type"
262+ elif (
263+ "fence" in inst_name
264+ or inst_name .startswith ("cbo." )
265+ or inst_name .startswith ("ssrdp" )
266+ ):
267+ inst_type = "I-type"
268+
269+ # Fallback: if inst_type is still "Unknown", infer it from the variable names (any two of the "s1"/"s2"/"d" suffixes give R-type; a CSR operand set gives I-type).
270+ if inst_type == "Unknown" :
271+ var_names = [
272+ var .get ("name" , "" ).lower () for var in chosen_encoding .get ("variables" , [])
273+ ]
274+ # Remove empty names if any.
275+ var_names = [name for name in var_names if name ]
276+ if (
277+ any (name .endswith ("s1" ) for name in var_names )
278+ and any (name .endswith ("d" ) for name in var_names )
279+ or (
280+ any (name .endswith ("s1" ) for name in var_names )
281+ and any (name .endswith ("s2" ) for name in var_names )
282+ )
283+ or (
284+ any (name .endswith ("s2" ) for name in var_names )
285+ and any (name .endswith ("d" ) for name in var_names )
286+ )
287+ ):
288+ inst_type = "R-type"
289+ elif {"csr" , "imm" , "rd" }.issubset (set (var_names )) or {
290+ "csr" ,
291+ "uimm" ,
292+ "rd" ,
293+ }.issubset (set (var_names )):
294+ inst_type = "I-type"
272295
273- # Insert or update the new field "type:" immediately after "long_name:"
296+ # Insert (or update) the "format" field immediately after "long_name:".
274297 if "long_name" in data :
275298 keys = list (data .keys ())
276299 idx = keys .index ("long_name" )
300+ # Use "format" as the key (change to "type" if desired)
277301 if "format" in data :
278302 data ["format" ] = inst_type
279303 else :
@@ -303,9 +327,7 @@ def main():
303327 if len (sys .argv ) < 2 :
304328 print ("Usage: {} <file_or_directory>" .format (sys .argv [0 ]))
305329 sys .exit (1 )
306-
307330 path = Path (sys .argv [1 ])
308-
309331 if path .is_file () and path .suffix == ".yaml" :
310332 process_file (path )
311333 elif path .is_dir ():
0 commit comments