@@ -71,12 +71,20 @@ size_t line_count(mmap_handle_t mht){
7171}
7272
7373std::tuple<std::string, std::string> shuffle (const std::string &src, const std::string &tgt){
74+ auto res = shuffle_sample (src, tgt, 0 );
75+ return std::make_tuple (std::get<0 >(res), std::get<1 >(res));
76+ }
77+
78+ std::tuple<std::string, std::string, std::string, std::string> shuffle_sample (const std::string &src, const std::string &tgt, long long sample){
7479 mmap_handle_t smht = mmap_open (src);
7580 mmap_handle_t tmht = mmap_open (tgt);
7681
7782 std::string src_out = src + " .shuffled" ;
7883 std::string tgt_out = tgt + " .shuffled" ;
7984
85+ std::string src_sample_out = src_out + " .sample" ;
86+ std::string tgt_sample_out = tgt_out + " .sample" ;
87+
8088 // size_t src_count = line_count(smht);
8189
8290 std::vector<line_off_t > offsets;
@@ -122,12 +130,38 @@ std::tuple<std::string, std::string> shuffle(const std::string &src, const std::
122130 std::ofstream src_of (src_out, std::ios::trunc);
123131 std::ofstream tgt_of (tgt_out, std::ios::trunc);
124132
133+ std::ofstream *src_sample_of = nullptr ;
134+ std::ofstream *tgt_sample_of = nullptr ;
135+
136+ if (sample > 0 ){
137+ src_sample_of = new std::ofstream (src_sample_out, std::ios::trunc);
138+ tgt_sample_of = new std::ofstream (tgt_sample_out, std::ios::trunc);
139+ }
140+
125141 // std::cout << offsets.size() << std::endl << std::endl;
126142 for (size_t i = 0 ; i < offsets.size (); i++){
127- src_of.write (offsets[i].src_start , offsets[i].src_end - offsets[i].src_start );
128- src_of.write (" \n " , 1 );
129- tgt_of.write (offsets[i].tgt_start , offsets[i].tgt_end - offsets[i].tgt_start );
130- tgt_of.write (" \n " , 1 );
143+ if (sample > 0 && i < sample){
144+ src_sample_of->write (offsets[i].src_start , offsets[i].src_end - offsets[i].src_start );
145+ src_sample_of->write (" \n " , 1 );
146+ tgt_sample_of->write (offsets[i].tgt_start , offsets[i].tgt_end - offsets[i].tgt_start );
147+ tgt_sample_of->write (" \n " , 1 );
148+ }else {
149+ src_of.write (offsets[i].src_start , offsets[i].src_end - offsets[i].src_start );
150+ src_of.write (" \n " , 1 );
151+ tgt_of.write (offsets[i].tgt_start , offsets[i].tgt_end - offsets[i].tgt_start );
152+ tgt_of.write (" \n " , 1 );
153+ }
154+ }
155+
156+ if (tgt_sample_of != nullptr ){
157+ tgt_sample_of->close ();
158+ delete tgt_sample_of;
159+ tgt_sample_of = nullptr ;
160+ }
161+ if (src_sample_of != nullptr ){
162+ src_sample_of->close ();
163+ delete src_sample_of;
164+ src_sample_of = nullptr ;
131165 }
132166
133167 src_of.close ();
@@ -136,5 +170,5 @@ std::tuple<std::string, std::string> shuffle(const std::string &src, const std::
136170 mmap_close (smht);
137171 mmap_close (tmht);
138172
139- return std::make_tuple (src_out, tgt_out);
173+ return std::make_tuple (src_out, tgt_out, src_sample_out, tgt_sample_out );
140174}
0 commit comments