@@ -437,26 +437,33 @@ def generateCollTestDataObjects(self, filename, icu_version, ignorePunctuation,
437437 # It's a data line. Include in testing.
438438 if not prev :
439439 # Just getting started.
440- prev = self .parseCollTestData (item )
440+ prev , prev_codepoints = self .parseCollTestData (item )
441441 continue
442442
443443 # Get the code points for each test
444- next = self .parseCollTestData (item )
444+ try :
445+ next , next_codepoints = self .parseCollTestData (item )
446+ except BaseException as e :
447+ pass
445448
446449 if not next :
447450 # This is a problem with the data input. D80[0-F] is the high surrogate
448451 data_errors .append ([index , item ])
449452 continue
450453
451454 label = str (count ).rjust (max_digits , "0" )
452- new_test = {"label" : label , "s1" : prev , "s2" : next , "strength" : "identical" , "line" : line_number , "source_file" : filename }
455+ new_test = {"label" : label , "s1" : prev , "s2" : next , "strength" : "identical" , "line" : line_number ,
456+ "source_file" : filename ,
457+ "s1_codes" : prev_codepoints ,
458+ "s2_codes" : next_codepoints }
453459 if ignorePunctuation :
454460 new_test ["ignorePunctuation" ] = True
455461 test_list .append (new_test )
456462
457463 verify_list .append ({"label" : label , "verify" : True })
458464
459465 prev = next # set up for next pair
466+ prev_codepoints = next_codepoints
460467 count += 1
461468 index += 1
462469
@@ -476,16 +483,21 @@ def parseCollTestData(self, testdata):
476483 recodepoint = re .compile (r"[0-9a-fA-F]{4,6}" )
477484
478485 return_list = []
486+ code_text = []
479487 codepoints = recodepoint .findall (testdata )
480488 for code in codepoints :
489+ code_text .append (code )
481490 num_code = int (code , 16 )
482491 if num_code >= 0xD800 and num_code <= 0xDFFF :
483- return None
492+ return None , None
484493 return_list .append (self .stringifyCode (num_code ))
485- return "" .join (return_list )
494+ return "" .join (return_list ), code_text
486495
487496 def stringifyCode (self , cp ):
488497 # Just include character and escaping will work in JSONification
498+ if cp >= 0x10000 :
499+ # This is an SMP character. Do we handle it differently?
500+ pass
489501 try :
490502 teststring = chr (cp )
491503 except ValueError as err :
0 commit comments