@@ -350,40 +350,73 @@ void Term::setup_bins()
350350{
351351 if (bins_start_index.size ()==0 ) // if not previously calculated or wrongly sized
352352 {
353- // bins
354- size_t observations_in_bin{std::max ((max_index+1 )/bins,static_cast <size_t >(1 ))};
355-
356- // Finding unique values in values_sorted - these will be used to skip i%observations_in_bin==0 criteria for general eligibility
357- std::vector<double > values_sorted_unique (sorted_vectors.values_sorted .size ());
358- for (size_t i = 0 ; i <= max_index; ++i)
359- {
360- values_sorted_unique[i]=sorted_vectors.values_sorted [i];
361- }
362- auto ip{std::unique (values_sorted_unique.begin (),values_sorted_unique.end ())};
363- values_sorted_unique.resize (std::distance (values_sorted_unique.begin (),ip));
364-
365353 bins_start_index.reserve (bins+1 );
366354 bins_end_index.reserve (bins+1 );
367- // Start_index
368355 bins_start_index.push_back (0 );
369- if (bins>1 )
356+
357+ // Start indexes
358+ bool can_create_bins{bins>1 };
359+ if (can_create_bins)
370360 {
371361 size_t start_row{min_observations_in_split};
372362 size_t end_row{max_index+1 -min_observations_in_split};
363+
364+ // find potential start indexes
365+ std::vector<size_t > potential_start_indexes;
366+ potential_start_indexes.reserve (sorted_vectors.values_sorted .size ());
373367 for (size_t i = start_row; i <= end_row; ++i)
374368 {
375- size_t last_bin_start_index{bins_start_index[bins_start_index.size ()-1 ]};
376- bool eligible_on_spacing_between_observations{i >= last_bin_start_index + observations_in_bin || values_sorted_unique.size ()<=bins || i == start_row || i == end_row};
377- bool eligible_on_unique_numbers{i>0 && !check_if_approximately_equal (sorted_vectors.values_sorted [i],sorted_vectors.values_sorted [i-1 ])};
369+ bool is_eligible_start_index{i>0 && !check_if_approximately_equal (sorted_vectors.values_sorted [i],sorted_vectors.values_sorted [i-1 ])};
370+ if (is_eligible_start_index)
371+ potential_start_indexes.push_back (i);
372+ }
373+ size_t last_potential_start_index{potential_start_indexes.size ()-1 };
378374
379- bool create_bin{eligible_on_spacing_between_observations && eligible_on_unique_numbers};
380- if (create_bin)
375+ bool potential_start_indexes_exist{potential_start_indexes.size ()>0 };
376+ bool fewer_start_indexes_than_bins{potential_start_indexes.size ()<bins};
377+ if (potential_start_indexes_exist)
378+ {
379+ if (fewer_start_indexes_than_bins)
380+ {
381+ bins_start_index.insert (bins_start_index.end (),std::make_move_iterator (potential_start_indexes.begin ()),std::make_move_iterator (potential_start_indexes.end ()));
382+ }
383+ else if (bins==2 )
384+ {
385+ bins_start_index.push_back (potential_start_indexes[0 ]);
386+ }
387+ else if (bins==3 )
388+ {
389+ bins_start_index.push_back (potential_start_indexes[0 ]);
390+ bins_start_index.push_back (potential_start_indexes[last_potential_start_index]);
391+ }
392+ else
381393 {
382- bins_start_index.push_back (i);
394+ bins_start_index.push_back (potential_start_indexes[0 ]); // first bin
395+
396+ size_t observations_between_outer_start_indexes{potential_start_indexes[last_potential_start_index]-potential_start_indexes[0 ]};
397+ size_t bins_to_create{bins-2 };
398+ size_t desired_observations_in_bin{std::max ((observations_between_outer_start_indexes)/bins_to_create+1 , static_cast <size_t >(1 ))};
399+ size_t desired_observations_in_second_last_bin{desired_observations_in_bin*4 /5 };
400+ size_t index_of_start_index_for_previous_bin{0 };
401+ size_t distance;
402+ size_t distance_to_end;
403+ for (size_t index_of_start_index = 1 ; index_of_start_index < last_potential_start_index-1 ; ++index_of_start_index)
404+ {
405+ distance = potential_start_indexes[index_of_start_index]-potential_start_indexes[index_of_start_index_for_previous_bin];
406+ distance_to_end = potential_start_indexes[last_potential_start_index]-potential_start_indexes[index_of_start_index];
407+ bool can_add_bin{distance>=desired_observations_in_bin && distance_to_end>=desired_observations_in_second_last_bin};
408+ if (can_add_bin)
409+ {
410+ bins_start_index.push_back (potential_start_indexes[index_of_start_index]);
411+ index_of_start_index_for_previous_bin = index_of_start_index;
412+ }
413+ }
414+
415+ bins_start_index.push_back (potential_start_indexes[last_potential_start_index]); // last bin
383416 }
384417 }
385418 }
386- // End index
419+ // End indexes
387420 if (bins_start_index.size ()>0 )
388421 {
389422 for (size_t i = 1 ; i < bins_start_index.size (); ++i)
@@ -692,4 +725,4 @@ std::vector<std::vector<size_t>> distribute_terms_to_cores(std::vector<Term> &te
692725 }
693726
694727 return output;
695- }
728+ }
0 commit comments