1
- # This software is Copyright (c) 2012-2020 magnum, and it is hereby
1
+ # This software is Copyright (c) 2012-2024 magnum, and it is hereby
2
2
# released to the general public under the following terms:
3
3
# Redistribution and use in source and binary forms, with or without
4
4
# modification, are permitted.
5
5
#
6
6
# Generic implementation of "dumb" exhaustive search of Unicode BMP.
7
- # Default is to try *all* allocated characters in the BMP of Unicode v13
8
- # (there's 55,387 of them). Even if a fast format can exhaust two characters
7
+ # Default is to try *all* allocated characters in the BMP of Unicode v16
8
+ # (there are 55,537 of them). Even if a fast format can exhaust two characters
9
9
# in 15 minutes, three characters would take 1.5 years...
10
10
#
11
11
# Note that these modes will handle --max-len differently than normal: They
@@ -22,7 +22,7 @@ int maxlength; // Maximum password length to try
22
22
int last; // Last character position, zero-based
23
23
int lastid; // Character index in the last position
24
24
int id[0x7f]; // Current character indices for other positions
25
- int charset[0x10000 ], c0; // Characters
25
+ int charset[0xd900 ], c0; // Characters
26
26
27
27
void init()
28
28
{
@@ -43,7 +43,7 @@ void init()
43
43
44
44
/*
45
45
* This defines the character set. This is auto-generated from UnicodeData.txt
46
- * and we skip control characters.
46
+ * of Unicode 16.0.0 and we skip control characters.
47
47
*/
48
48
i = 0;
49
49
// 0000..007F; Basic Latin
@@ -119,9 +119,6 @@ void init()
119
119
charset[i++] = c++;
120
120
// 0600..06FF; Arabic
121
121
c = 0x600; // from ARABIC NUMBER SIGN
122
- while (c <= 0x61c) // ..to ARABIC LETTER MARK
123
- charset[i++] = c++;
124
- c = 0x61e; // from ARABIC TRIPLE DOT PUNCTUATION MARK
125
122
while (c <= 0x6ff) // ..to ARABIC LETTER HEH WITH INVERTED V
126
123
charset[i++] = c++;
127
124
// 0700..074F; Syriac
@@ -163,14 +160,17 @@ void init()
163
160
c = 0x860; // from SYRIAC LETTER MALAYALAM NGA
164
161
while (c <= 0x86a) // ..to SYRIAC LETTER MALAYALAM SSA
165
162
charset[i++] = c++;
166
- // 08A0..08FF ; Arabic Extended-A
167
- c = 0x8a0 ; // from ARABIC LETTER BEH WITH SMALL V BELOW
168
- while (c <= 0x8b4 ) // ..to ARABIC LETTER KAF WITH DOT BELOW
163
+ // 0870..089F ; Arabic Extended-B
164
+ c = 0x870 ; // from ARABIC LETTER ALEF WITH ATTACHED FATHA
165
+ while (c <= 0x88e ) // ..to ARABIC VERTICAL TAIL
169
166
charset[i++] = c++;
170
- c = 0x8b6; // from ARABIC LETTER BEH WITH SMALL MEEM ABOVE
171
- while (c <= 0x8c7) // ..to ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
167
+ charset[i++] = 0x890; // ARABIC POUND MARK ABOVE
168
+ charset[i++] = 0x891; // ARABIC PIASTRE MARK ABOVE
169
+ c = 0x897; // from ARABIC PEPET
170
+ while (c <= 0x89f) // ..to ARABIC HALF MADDA OVER MADDA
172
171
charset[i++] = c++;
173
- c = 0x8d3; // from ARABIC SMALL LOW WAW
172
+ // 08A0..08FF; Arabic Extended-A
173
+ c = 0x8a0; // from ARABIC LETTER BEH WITH SMALL V BELOW
174
174
while (c <= 0x8ff) // ..to ARABIC MARK SIDEWAYS NOON GHUNNA
175
175
charset[i++] = c++;
176
176
// 0900..097F; Devanagari
@@ -360,7 +360,7 @@ void init()
360
360
c = 0xc2a; // from TELUGU LETTER PA
361
361
while (c <= 0xc39) // ..to TELUGU LETTER HA
362
362
charset[i++] = c++;
363
- c = 0xc3d ; // from TELUGU SIGN AVAGRAHA
363
+ c = 0xc3c ; // from TELUGU SIGN NUKTA
364
364
while (c <= 0xc44) // ..to TELUGU VOWEL SIGN VOCALIC RR
365
365
charset[i++] = c++;
366
366
charset[i++] = 0xc46; // TELUGU VOWEL SIGN E
@@ -406,14 +406,16 @@ void init()
406
406
charset[i++] = c++;
407
407
charset[i++] = 0xcd5; // KANNADA LENGTH MARK
408
408
charset[i++] = 0xcd6; // KANNADA AI LENGTH MARK
409
+ charset[i++] = 0xcdd; // KANNADA LETTER NAKAARA POLLU
410
+ charset[i++] = 0xcde; // KANNADA LETTER FA
409
411
c = 0xce0; // from KANNADA LETTER VOCALIC RR
410
412
while (c <= 0xce3) // ..to KANNADA VOWEL SIGN VOCALIC LL
411
413
charset[i++] = c++;
412
414
c = 0xce6; // from KANNADA DIGIT ZERO
413
415
while (c <= 0xcef) // ..to KANNADA DIGIT NINE
414
416
charset[i++] = c++;
415
417
charset[i++] = 0xcf1; // KANNADA SIGN JIHVAMULIYA
416
- charset[i++] = 0xcf2 ; // KANNADA SIGN UPADHMANIYA
418
+ charset[i++] = 0xcf3 ; // KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
417
419
// 0D00..0D7F; Malayalam
418
420
c = 0xd00; // from MALAYALAM SIGN COMBINING ANUSVARA ABOVE
419
421
while (c <= 0xd0c) // ..to MALAYALAM LETTER VOCALIC L
@@ -483,7 +485,7 @@ void init()
483
485
while (c <= 0xec4) // ..to LAO VOWEL SIGN AI
484
486
charset[i++] = c++;
485
487
c = 0xec8; // from LAO TONE MAI EK
486
- while (c <= 0xecd ) // ..to LAO NIGGAHITA
488
+ while (c <= 0xece ) // ..to LAO YAMAKKAN
487
489
charset[i++] = c++;
488
490
c = 0xed0; // from LAO DIGIT ZERO
489
491
while (c <= 0xed9) // ..to LAO DIGIT NINE
@@ -596,11 +598,9 @@ void init()
596
598
charset[i++] = c++;
597
599
// 1700..171F; Tagalog
598
600
c = 0x1700; // from TAGALOG LETTER A
599
- while (c <= 0x170c) // ..to TAGALOG LETTER YA
600
- charset[i++] = c++;
601
- c = 0x170e; // from TAGALOG LETTER LA
602
- while (c <= 0x1714) // ..to TAGALOG SIGN VIRAMA
601
+ while (c <= 0x1715) // ..to TAGALOG SIGN PAMUDPOD
603
602
charset[i++] = c++;
603
+ charset[i++] = 0x171f; // TAGALOG LETTER ARCHAIC RA
604
604
// 1720..173F; Hanunoo
605
605
c = 0x1720; // from HANUNOO LETTER A
606
606
while (c <= 0x1736) // ..to PHILIPPINE DOUBLE PUNCTUATION
@@ -629,9 +629,6 @@ void init()
629
629
charset[i++] = c++;
630
630
// 1800..18AF; Mongolian
631
631
c = 0x1800; // from MONGOLIAN BIRGA
632
- while (c <= 0x180e) // ..to MONGOLIAN VOWEL SEPARATOR
633
- charset[i++] = c++;
634
- c = 0x1810; // from MONGOLIAN DIGIT ZERO
635
632
while (c <= 0x1819) // ..to MONGOLIAN DIGIT NINE
636
633
charset[i++] = c++;
637
634
c = 0x1820; // from MONGOLIAN LETTER A
@@ -704,14 +701,14 @@ void init()
704
701
charset[i++] = c++;
705
702
// 1AB0..1AFF; Combining Diacritical Marks Extended
706
703
c = 0x1ab0; // from COMBINING DOUBLED CIRCUMFLEX ACCENT
707
- while (c <= 0x1ac0 ) // ..to COMBINING LATIN SMALL LETTER TURNED W BELOW
704
+ while (c <= 0x1ace ) // ..to COMBINING LATIN SMALL LETTER INSULAR T
708
705
charset[i++] = c++;
709
706
// 1B00..1B7F; Balinese
710
707
c = 0x1b00; // from BALINESE SIGN ULU RICEM
711
- while (c <= 0x1b4b ) // ..to BALINESE LETTER ASYURA SASAK
708
+ while (c <= 0x1b4c ) // ..to BALINESE LETTER ARCHAIC JNYA
712
709
charset[i++] = c++;
713
- c = 0x1b50 ; // from BALINESE DIGIT ZERO
714
- while (c <= 0x1b7c ) // ..to BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING
710
+ c = 0x1b4e ; // from BALINESE INVERTED CARIK SIKI
711
+ while (c <= 0x1b7f ) // ..to BALINESE PANTI BAWAK
715
712
charset[i++] = c++;
716
713
// 1B80..1BBF; Sundanese
717
714
c = 0x1b80; // from SUNDANESE SIGN PANYECEK
@@ -739,7 +736,7 @@ void init()
739
736
charset[i++] = c++;
740
737
// 1C80..1C8F; Cyrillic Extended-C
741
738
c = 0x1c80; // from CYRILLIC SMALL LETTER ROUNDED VE
742
- while (c <= 0x1c88 ) // ..to CYRILLIC SMALL LETTER UNBLENDED UK
739
+ while (c <= 0x1c8a ) // ..to CYRILLIC SMALL LETTER TJE
743
740
charset[i++] = c++;
744
741
// 1C90..1CBF; Georgian Extended
745
742
c = 0x1c90; // from GEORGIAN MTAVRULI CAPITAL LETTER AN
@@ -765,9 +762,6 @@ void init()
765
762
charset[i++] = c++;
766
763
// 1DC0..1DFF; Combining Diacritical Marks Supplement
767
764
c = 0x1dc0; // from COMBINING DOTTED GRAVE ACCENT
768
- while (c <= 0x1df9) // ..to COMBINING WIDE INVERTED BRIDGE BELOW
769
- charset[i++] = c++;
770
- c = 0x1dfb; // from COMBINING DELETION MARK
771
765
while (c <= 0x1dff) // ..to COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
772
766
charset[i++] = c++;
773
767
// 1E00..1EFF; Latin Extended Additional
@@ -831,7 +825,7 @@ void init()
831
825
charset[i++] = c++;
832
826
// 20A0..20CF; Currency Symbols
833
827
c = 0x20a0; // from EURO-CURRENCY SIGN
834
- while (c <= 0x20bf ) // ..to BITCOIN SIGN
828
+ while (c <= 0x20c0 ) // ..to SOM SIGN
835
829
charset[i++] = c++;
836
830
// 20D0..20FF; Combining Diacritical Marks for Symbols
837
831
c = 0x20d0; // from COMBINING LEFT HARPOON ABOVE
@@ -859,7 +853,7 @@ void init()
859
853
charset[i++] = c++;
860
854
// 2400..243F; Control Pictures
861
855
c = 0x2400; // from SYMBOL FOR NULL
862
- while (c <= 0x2426 ) // ..to SYMBOL FOR SUBSTITUTE FORM TWO
856
+ while (c <= 0x2429 ) // ..to SYMBOL FOR DELETE MEDIUM SHADE FORM
863
857
charset[i++] = c++;
864
858
// 2440..245F; Optical Character Recognition
865
859
c = 0x2440; // from OCR HOOK
@@ -925,10 +919,7 @@ void init()
925
919
charset[i++] = c++;
926
920
// 2C00..2C5F; Glagolitic
927
921
c = 0x2c00; // from GLAGOLITIC CAPITAL LETTER AZU
928
- while (c <= 0x2c2e) // ..to GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
929
- charset[i++] = c++;
930
- c = 0x2c30; // from GLAGOLITIC SMALL LETTER AZU
931
- while (c <= 0x2c5e) // ..to GLAGOLITIC SMALL LETTER LATINATE MYSLITE
922
+ while (c <= 0x2c5f) // ..to GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
932
923
charset[i++] = c++;
933
924
// 2C60..2C7F; Latin Extended-C
934
925
c = 0x2c60; // from LATIN CAPITAL LETTER L WITH DOUBLE BAR
@@ -989,7 +980,7 @@ void init()
989
980
charset[i++] = c++;
990
981
// 2E00..2E7F; Supplemental Punctuation
991
982
c = 0x2e00; // from RIGHT ANGLE SUBSTITUTION MARKER
992
- while (c <= 0x2e52 ) // ..to TIRONIAN SIGN CAPITAL ET
983
+ while (c <= 0x2e5d ) // ..to OBLIQUE HYPHEN
993
984
charset[i++] = c++;
994
985
// 2E80..2EFF; CJK Radicals Supplement
995
986
c = 0x2e80; // from CJK RADICAL REPEAT
@@ -1004,7 +995,7 @@ void init()
1004
995
charset[i++] = c++;
1005
996
// 2FF0..2FFF; Ideographic Description Characters
1006
997
c = 0x2ff0; // from IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT
1007
- while (c <= 0x2ffb ) // ..to IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
998
+ while (c <= 0x2fff ) // ..to IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
1008
999
charset[i++] = c++;
1009
1000
// 3000..303F; CJK Symbols and Punctuation
1010
1001
c = 0x3000; // from IDEOGRAPHIC SPACE
@@ -1039,8 +1030,9 @@ void init()
1039
1030
charset[i++] = c++;
1040
1031
// 31C0..31EF; CJK Strokes
1041
1032
c = 0x31c0; // from CJK STROKE T
1042
- while (c <= 0x31e3 ) // ..to CJK STROKE Q
1033
+ while (c <= 0x31e5 ) // ..to CJK STROKE SZP
1043
1034
charset[i++] = c++;
1035
+ charset[i++] = 0x31ef; // IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
1044
1036
// 31F0..31FF; Katakana Phonetic Extensions
1045
1037
c = 0x31f0; // from KATAKANA LETTER SMALL KU
1046
1038
while (c <= 0x31ff) // ..to KATAKANA LETTER SMALL RO
@@ -1066,7 +1058,7 @@ void init()
1066
1058
charset[i++] = c++;
1067
1059
// 4E00..9FFF; CJK Unified Ideographs
1068
1060
c = 0x4e00; // from <CJK Ideograph, First>
1069
- while (c <= 0x9ffc ) // ..to <CJK Ideograph, Last>
1061
+ while (c <= 0x9fff ) // ..to <CJK Ideograph, Last>
1070
1062
charset[i++] = c++;
1071
1063
// A000..A48F; Yi Syllables
1072
1064
c = 0xa000; // from YI SYLLABLE IT
@@ -1098,12 +1090,14 @@ void init()
1098
1090
charset[i++] = c++;
1099
1091
// A720..A7FF; Latin Extended-D
1100
1092
c = 0xa720; // from MODIFIER LETTER STRESS AND HIGH TONE
1101
- while (c <= 0xa7bf ) // ..to LATIN SMALL LETTER GLOTTAL U
1093
+ while (c <= 0xa7cd ) // ..to LATIN SMALL LETTER S WITH DIAGONAL STROKE
1102
1094
charset[i++] = c++;
1103
- c = 0xa7c2; // from LATIN CAPITAL LETTER ANGLICANA W
1104
- while (c <= 0xa7ca) // ..to LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
1095
+ charset[i++] = 0xa7d0; // LATIN CAPITAL LETTER CLOSED INSULAR G
1096
+ charset[i++] = 0xa7d1; // LATIN SMALL LETTER CLOSED INSULAR G
1097
+ c = 0xa7d5; // from LATIN SMALL LETTER DOUBLE WYNN
1098
+ while (c <= 0xa7dc) // ..to LATIN CAPITAL LETTER LAMBDA WITH STROKE
1105
1099
charset[i++] = c++;
1106
- c = 0xa7f5 ; // from LATIN CAPITAL LETTER REVERSED HALF H
1100
+ c = 0xa7f2 ; // from MODIFIER LETTER CAPITAL C
1107
1101
while (c <= 0xa7ff) // ..to LATIN EPIGRAPHIC LETTER ARCHAIC M
1108
1102
charset[i++] = c++;
1109
1103
// A800..A82F; Syloti Nagri
@@ -1258,19 +1252,16 @@ void init()
1258
1252
charset[i++] = c++;
1259
1253
// FB50..FDFF; Arabic Presentation Forms-A
1260
1254
c = 0xfb50; // from ARABIC LETTER ALEF WASLA ISOLATED FORM
1261
- while (c <= 0xfbc1 ) // ..to ARABIC SYMBOL SMALL TAH BELOW
1255
+ while (c <= 0xfbc2 ) // ..to ARABIC SYMBOL WASLA ABOVE
1262
1256
charset[i++] = c++;
1263
1257
c = 0xfbd3; // from ARABIC LETTER NG ISOLATED FORM
1264
- while (c <= 0xfd3f) // ..to ORNATE RIGHT PARENTHESIS
1265
- charset[i++] = c++;
1266
- c = 0xfd50; // from ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM
1267
1258
while (c <= 0xfd8f) // ..to ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
1268
1259
charset[i++] = c++;
1269
1260
c = 0xfd92; // from ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM
1270
1261
while (c <= 0xfdc7) // ..to ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
1271
1262
charset[i++] = c++;
1272
1263
c = 0xfdf0; // from ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM
1273
- while (c <= 0xfdfd ) // ..to ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
1264
+ while (c <= 0xfdff ) // ..to ARABIC LIGATURE AZZA WA JALL
1274
1265
charset[i++] = c++;
1275
1266
// FE00..FE0F; Variation Selectors
1276
1267
c = 0xfe00; // from VARIATION SELECTOR-1
0 commit comments