Skip to content

Commit 1af26f1

Browse files
committed
rebuilt the string matching system
1 parent 12e926f commit 1af26f1

File tree

1 file changed

+55
-32
lines changed

1 file changed

+55
-32
lines changed

z80-disassembler.py

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -188,25 +188,44 @@ def process_hextype(hexaddr):
188188
return hexaddr.replace("0x","#")
189189
return hexaddr
190190

191-
def build_strings_from_binary_data(binary_data):
    """
    Collect the text runs found in binary_data and return them as one string.

    Each value accepted by is_alphanumeric() is added to the current run as
    chr(value). A value accepted by is_terminator() closes a non-empty run:
    decode_terminator(value) is appended to the run and the run is stored.
    Every other value is skipped. A run still open when the data ends is
    stored without a terminator.

    Args:
        binary_data: Iterable of integer byte values.

    Returns:
        str: All stored runs joined together with no separator.
    """
    pieces = []
    run = []

    for value in binary_data:
        if is_alphanumeric(value):
            run.append(chr(value))
            continue
        # A terminator only matters when there is an open run to close.
        if not is_terminator(value) or not run:
            continue
        run.append(decode_terminator(value))
        pieces.append(''.join(run))
        run = []

    # Keep a trailing run that never reached a terminator.
    if run:
        pieces.append(''.join(run))

    return ''.join(pieces)
191+
# def build_strings_from_binary_data(binary_data):
192+
# strings = []
193+
# current_string = []
194+
#
195+
# for byte in binary_data:
196+
# if is_alphanumeric(byte):
197+
# current_string.append(chr(byte))
198+
# elif is_terminator(byte):
199+
# if current_string:
200+
# current_string.append(decode_terminator(byte))
201+
# strings.append(''.join(current_string))
202+
# current_string = []
203+
#
204+
# # Append the last string if it exists
205+
# if current_string:
206+
# strings.append(''.join(current_string))
207+
#
208+
# # return strings
209+
# return (''.join(strings))
210+
211+
212+
def build_strings_from_binary_data(binary_data, min_length=3):
    """
    Search binary data for runs of printable ASCII and return them as strings.

    A run is a maximal sequence of consecutive bytes in the range 0x20-0x7E
    (space through tilde). Runs shorter than min_length are discarded.

    Args:
        binary_data (bytes): The binary data to search. Any bytes-like object
            is accepted as-is; any other iterable of integers in range(256)
            (for example a list of byte values) is converted with bytes().
        min_length (int): The minimum length of ASCII string to find.
            Default is 3. Values below 1 are treated as 1.

    Returns:
        list[str]: ASCII strings found in the binary data, in order of
            appearance.
    """
    # re needs a bytes-like object when the pattern is bytes; a plain list of
    # byte values would otherwise raise TypeError.
    if not isinstance(binary_data, (bytes, bytearray, memoryview)):
        binary_data = bytes(binary_data)

    # A quantifier of {0,} yields empty matches and a negative bound is not a
    # valid quantifier at all (re would treat it as literal text), so clamp.
    min_length = max(1, min_length)

    # Regular expression to match runs of printable ASCII characters
    pattern = rb'[\x20-\x7E]{%d,}' % min_length
    matches = re.findall(pattern, binary_data)
    # Every matched byte is in 0x20-0x7E, so decoding as ASCII cannot fail;
    # errors='ignore' is kept purely as a safety net.
    return [m.decode('ascii', errors='ignore') for m in matches]
210229

211230
def print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█', print_end="\r"):
212231
"""
@@ -1465,22 +1484,22 @@ def findstring(memstart, memend):
14651484
# print("3")
14661485
# found terminator, output it
14671486
# known_string=f'DEFB {b}{decode_terminator(code[m][0])}'
1468-
code_output(orig,f'x1 DEFB {b}{decode_terminator(code[m][0])}',list_address,f'{addcomment}{hexstyle}{orig:x} to {hexstyle}{(orig+len(a)+1):x}')
1487+
code_output(orig,f'DEFB {b}{decode_terminator(code[m][0])}',list_address,f'{addcomment}{hexstyle}{orig:x} to {hexstyle}{(orig+len(a)+1):x}')
14691488
# print(f'Bump 1 {hex(program_counter)}-->{hex(program_counter+len(a)-1)}')
14701489
program_counter += len(a)-1
14711490
elif identified(m)=="S" and not is_terminator(code[m][0]):
14721491
# print("------>>>> 4")
14731492
# Causing issues with some string endings
14741493
#No terminator, just dump the string
14751494
# print("-->", hex(program_counter),b,a)
1476-
code_output(orig,f'x2 DEFB {a}',list_address,f'{addcomment}{hexstyle}{orig:x} to {hexstyle}{orig+len(a)-2:x}')
1495+
code_output(orig,f'DEFB {a}',list_address,f'{addcomment}{hexstyle}{orig:x} to {hexstyle}{orig+len(a)-2:x}')
14771496
# print(f'Bump 2 {hex(program_counter)}-->{hex(program_counter+len(a)-2)}')
14781497
program_counter += len(a)-1
14791498
# program_counter=program_counter+len(b)
14801499
# str_locations[program_counter]
14811500
else:
14821501
# print("5")
1483-
code_output(orig,f'x3 DEFB "{d}',list_address,f'{addcomment}{hexstyle}{orig:x} to {hexstyle}{orig+len(a)-2:x}')
1502+
code_output(orig,f'DEFB "{d}',list_address,f'{addcomment}{hexstyle}{orig:x} to {hexstyle}{orig+len(a)-2:x}')
14841503
# print(f'Bump 3 {hex(program_counter)}-->{hex(program_counter+len(a)-2)}')
14851504
program_counter += len(a)-2
14861505
# print(hex(program_counter))
@@ -1513,19 +1532,20 @@ def findstring(memstart, memend):
15131532
cnt=program_counter
15141533
result=build_strings_from_binary_data(tmp_array)
15151534
# print("-->",result)
1535+
# print(f"{len(result)}")
15161536
# result=result.replace('"', '",34,"').replace("\\", '", 0x5c, "')
15171537
# print("---->",result,code[src_array_index][1],---code[src_array_index][2],"\n")
15181538
# program_counter=program_counter+len(result)
15191539
str_len=len(result)
1520-
result=result.replace('"', '",34,"').replace("\\", '", 0x5c, "')
15211540
# print("-->",result,(identified(program_counter) == "S"),is_terminator(code[program_counter][0]))
15221541
# dump_code_array("-- term -->",program_counter,)
15231542
# print("-->",result)
15241543
#--------------------------------
15251544
#FIXME: Something in here is breaking labels after a string, probably one of the increments
15261545
# So it's adding code area to the string if the string isn't terminated, but the area is marked as code.
15271546
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1528-
if result!="":
1547+
if str_len>0:
1548+
result=result[0].replace('"', '",34,"').replace("\\", '", 0x5c, "')
15291549
# if 0xf77b < program_counter < 0xf79c:
15301550
# print("----> 1-",hex(program_counter),identified(program_counter))
15311551
# program_counter=program_counter+str_len
@@ -1535,31 +1555,34 @@ def findstring(memstart, memend):
15351555
else:
15361556
addcomment=""
15371557

1538-
code_output(program_counter,f'x4 DEFB "{result}{decode_terminator(code[program_counter+str_len][0])}',list_address,f'{addcomment}{hexstyle}{program_counter:x} to {hexstyle}{(program_counter+str_len+1):x}')
1558+
code_output(program_counter,f'DEFB "{result}{decode_terminator(code[program_counter+str_len][0])}',list_address,f'{addcomment}{hexstyle}{program_counter:x} to {hexstyle}{(program_counter+str_len):x}')
15391559
# Bump for terminator
15401560
# print(f'Bump 4 {hex(program_counter)}-->{hex(program_counter+str_len)}')
1541-
program_counter +=str_len+1
1561+
program_counter +=len(result)+1
15421562
else:
15431563
#Probably never called, but better safe etc etc
1544-
code_output(program_counter,f'x5 DEFB "{result}"',list_address)
1564+
code_output(program_counter,f'DEFB "{result}"',list_address)
15451565
elif (identified(program_counter) == "S") and (code[program_counter][0]>0x80) and not is_terminator(code[program_counter][0]):
15461566
# if 0xf77b < program_counter < 0xf79c:
15471567
# print("----> 2 -",hex(program_counter),identified(program_counter))
15481568
#Issue #30: This is part of the issue, but not sure why yet.
1549-
result=result+decode_terminator(code[program_counter][0]).replace('",',"")
1550-
code_output(program_counter-str_len,f'x6 DEFB {result}',list_address)
1569+
# result=result+decode_terminator(code[program_counter][0]).replace('",',"")
1570+
1571+
result=hex(code[program_counter][0])
1572+
1573+
code_output(program_counter-str_len,f'DEFB {result}',list_address)
15511574
# print(f'Bump 5 {hex(program_counter)}-->{hex(program_counter+1)}')
15521575
program_counter +=1 #str_len
15531576
else:
15541577
# print("----> 3 -",hex(program_counter),identified(program_counter))
1555-
code_output(program_counter-str_len,f'x7 DEFB {hexstyle}{(code[program_counter][0]):x}',list_address)
1578+
code_output(program_counter-str_len,f'DEFB {hexstyle}{(code[program_counter][0]):x}',list_address)
15561579
# print(f'Bump 6 {hex(program_counter)}-->{hex(program_counter+1)}')
15571580
program_counter +=1
15581581
# elif identified(program_counter) == "D" and (program_counter in str_locations) and not stay_in_code:
15591582
elif identified(program_counter) == "D" and (program_counter in str_locations):
15601583
#It's a string!
15611584
code_output(
1562-
program_counter, "x8 DEFB " + str_locations[program_counter], list_address
1585+
program_counter, "DEFB " + str_locations[program_counter], list_address
15631586
)
15641587
# print(f'Bump 7 {hex(program_counter)}-->{hex(program_counter+str_sizes[program_counter])}')
15651588
program_counter += str_sizes[program_counter]
@@ -1581,7 +1604,7 @@ def findstring(memstart, memend):
15811604
#BUG: Causes defb 01 01 on -c 0
15821605
if commentlevel==0:
15831606
out_tmp="; "+out_tmp
1584-
code_output(program_counter, f"x9 DEFB {hexstyle}{tmp:x}", list_address, f'{out_tmp}')
1607+
code_output(program_counter, f"DEFB {hexstyle}{tmp:x}", list_address, f'{out_tmp}')
15851608
# debug("PC Bump")
15861609
program_counter += 1 #FIXME - tripping PC too much?
15871610
elif identified(program_counter) == "Dw":

0 commit comments

Comments
 (0)