1515
1616public class BuildFilterFile {
1717
18- public static final int SEGMENT_BITS = 4 ;
18+ public static final int SEGMENT_BITS = 10 ;
1919
2020 public static void main (String ... args ) throws IOException {
2121 if (args .length != 1 ) {
2222 System .out .println ("Usage: java " + BuildFilterFile .class .getName () + " <textFile>\n "
23- + "Builds a .filter file from a text file that contains SHA-1 hashes and counts." );
24- // see also https://haveibeenpwned.com/passwords
23+ + "Builds a .filter file from a text file that contains SHA-1 hashes and counts.\n "
24+ + "You can get the hash file from https://haveibeenpwned.com/passwords\n "
25+ + "It needs to be a list of SHA-1 hashes, ordered by hash, line format <hash>:<count>." );
2526 return ;
2627 }
2728 String textFile = args [0 ];
@@ -32,6 +33,7 @@ public static void main(String... args) throws IOException {
3233 new File (filterFileName ).delete ();
3334 RandomAccessFile out = new RandomAccessFile (filterFileName , "rw" );
3435 int lines = 0 ;
36+ long [] segmentStarts = new long [1 << SEGMENT_BITS ];
3537 // header
3638 out .write (new byte [8 << SEGMENT_BITS ]);
3739 int currentSegment = 0 ;
@@ -45,49 +47,54 @@ public static void main(String... args) throws IOException {
4547 lines ++;
4648 long hash = 0 ;
4749 for (int i = 0 ; i < 16 ; i ++) {
48- hash <<= 4 ;
49- hash |= StringUtils .getHex (line .charAt (i ));
50+ hash = (hash << 4 ) | StringUtils .getHex (line .charAt (i ));
5051 }
5152 if (lastHash == hash ) {
52- System .out .println ("Warning: duplicate hash detected, ignoring: " + line );
53+ System .out .println ("Warning: duplicate 64-bit key detected, ignoring: " + line );
5354 continue ;
55+ } else if (Long .compareUnsigned (hash , lastHash ) < 0 ) {
56+ throw new IllegalArgumentException ("The file is not sorted by hash" );
5457 }
5558 lastHash = hash ;
5659 int dot = line .lastIndexOf (':' );
5760 int count = Integer .parseInt (line .substring (dot + 1 ), 10 );
58- // set the lowest bit to 0
61+ // clear the lowest bit
5962 long key = hash ^ (hash & 1 );
6063 // if common, set the lowest bit
6164 if (count > 9 ) {
6265 key |= 1 ;
6366 }
6467 int segment = (int ) (key >>> (64 - SEGMENT_BITS ));
6568 if (segment != currentSegment ) {
66- writeSegment ( keys , currentSegment , out );
67- long time = System . nanoTime () - start ;
68- System . out . println ( "Lines processed: " + lines + " " + ( time / lines ) + " ns/line" );
69+ segmentStarts [ currentSegment ] = out . getFilePointer ( );
70+ out . write ( getSegment ( keys )) ;
71+ keys . clear ( );
6972 currentSegment = segment ;
7073 }
74+ if (lines % 10000000 == 0 ) {
75+ long time = System .nanoTime () - start ;
76+ System .out .println (lines / 1000000 + " million lines processed, " + (time / lines ) + " ns/line" );
77+ }
7178 keys .add (key );
7279 }
73- writeSegment (keys , currentSegment , out );
80+ segmentStarts [currentSegment ] = out .getFilePointer ();
81+ out .write (getSegment (keys ));
7482 lineReader .close ();
83+ out .seek (0 );
84+ for (long s : segmentStarts ) {
85+ out .writeLong (s );
86+ }
7587 out .close ();
88+ long time = System .nanoTime () - start ;
89+ System .out .println (lines + " lines processed, " + (time / 1000000 / 1000 ) + " seconds" );
7690 }
7791
// Diff view: lines marked '-' belong to the removed writeSegment(keys, segment, out);
// lines marked '+' belong to its replacement getSegment(keys); unmarked lines are shared.
// getSegment copies the keys into a long[] and returns the bytes produced by
// XorPlus8.construct(array).getData(). Unlike writeSegment, it does no file I/O
// and does not clear `keys`.
78- private static void writeSegment (ArrayList <Long > keys , int segment ,
79- RandomAccessFile out ) throws IOException {
92+ private static byte [] getSegment (ArrayList <Long > keys ) {
8093 long [] array = new long [keys .size ()];
81- for (int i = 0 ; i < keys .size (); i ++) {
94+ for (int i = 0 ; i < keys .size (); i ++) {
8295 array [i ] = keys .get (i );
8396 }
// Removed behavior: take the current file length as the segment start, write it as a
// long at offset segment * 8, seek back to that start, write the filter data there,
// and then clear the key list.
84- long start = out .length ();
85- out .seek (segment * 8 );
86- out .writeLong (start );
87- out .seek (start );
88- XorPlus8 filter = XorPlus8 .construct (array );
89- out .write (filter .getData ());
90- keys .clear ();
// Added behavior: only build the filter and hand its data back to the caller.
97+ return XorPlus8 .construct (array ).getData ();
9198 }
9299
93100}
0 commit comments