Skip to content

Commit 47ab88a

Browse files
Merge pull request #6 from ottenbreit-data-science/bins_test
Further improved methodology for splitting into bins
2 parents 54eae0e + ad8fd79 commit 47ab88a

File tree

2 files changed

+56
-23
lines changed

2 files changed

+56
-23
lines changed

cpp/term.h

Lines changed: 55 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -350,40 +350,73 @@ void Term::setup_bins()
350350
{
351351
if(bins_start_index.size()==0) //if not previously calculated or wrongly sized
352352
{
353-
//bins
354-
size_t observations_in_bin{std::max((max_index+1)/bins,static_cast<size_t>(1))};
355-
356-
//Finding unique values in values_sorted - these will be used to skip i%observations_in_bin==0 criteria for general eligibility
357-
std::vector<double> values_sorted_unique(sorted_vectors.values_sorted.size());
358-
for (size_t i = 0; i <= max_index; ++i)
359-
{
360-
values_sorted_unique[i]=sorted_vectors.values_sorted[i];
361-
}
362-
auto ip{std::unique(values_sorted_unique.begin(),values_sorted_unique.end())};
363-
values_sorted_unique.resize(std::distance(values_sorted_unique.begin(),ip));
364-
365353
bins_start_index.reserve(bins+1);
366354
bins_end_index.reserve(bins+1);
367-
//Start_index
368355
bins_start_index.push_back(0);
369-
if(bins>1)
356+
357+
//Start indexes
358+
bool can_create_bins{bins>1};
359+
if(can_create_bins)
370360
{
371361
size_t start_row{min_observations_in_split};
372362
size_t end_row{max_index+1-min_observations_in_split};
363+
364+
//find potential start indexes
365+
std::vector<size_t> potential_start_indexes;
366+
potential_start_indexes.reserve(sorted_vectors.values_sorted.size());
373367
for (size_t i = start_row; i <= end_row; ++i)
374368
{
375-
size_t last_bin_start_index{bins_start_index[bins_start_index.size()-1]};
376-
bool eligible_on_spacing_between_observations{i >= last_bin_start_index + observations_in_bin || values_sorted_unique.size()<=bins || i == start_row || i == end_row};
377-
bool eligible_on_unique_numbers{i>0 && !check_if_approximately_equal(sorted_vectors.values_sorted[i],sorted_vectors.values_sorted[i-1])};
369+
bool is_eligible_start_index{i>0 && !check_if_approximately_equal(sorted_vectors.values_sorted[i],sorted_vectors.values_sorted[i-1])};
370+
if(is_eligible_start_index)
371+
potential_start_indexes.push_back(i);
372+
}
373+
size_t last_potential_start_index{potential_start_indexes.size()-1};
378374

379-
bool create_bin{eligible_on_spacing_between_observations && eligible_on_unique_numbers};
380-
if(create_bin)
375+
bool potential_start_indexes_exist{potential_start_indexes.size()>0};
376+
bool fewer_start_indexes_than_bins{potential_start_indexes.size()<bins};
377+
if(potential_start_indexes_exist)
378+
{
379+
if(fewer_start_indexes_than_bins)
380+
{
381+
bins_start_index.insert(bins_start_index.end(),std::make_move_iterator(potential_start_indexes.begin()),std::make_move_iterator(potential_start_indexes.end()));
382+
}
383+
else if(bins==2)
384+
{
385+
bins_start_index.push_back(potential_start_indexes[0]);
386+
}
387+
else if(bins==3)
388+
{
389+
bins_start_index.push_back(potential_start_indexes[0]);
390+
bins_start_index.push_back(potential_start_indexes[last_potential_start_index]);
391+
}
392+
else
381393
{
382-
bins_start_index.push_back(i);
394+
bins_start_index.push_back(potential_start_indexes[0]); //first bin
395+
396+
size_t observations_between_outer_start_indexes{potential_start_indexes[last_potential_start_index]-potential_start_indexes[0]};
397+
size_t bins_to_create{bins-2};
398+
size_t desired_observations_in_bin{std::max((observations_between_outer_start_indexes)/bins_to_create+1, static_cast<size_t>(1))};
399+
size_t desired_observations_in_second_last_bin{desired_observations_in_bin*4/5};
400+
size_t index_of_start_index_for_previous_bin{0};
401+
size_t distance;
402+
size_t distance_to_end;
403+
for (size_t index_of_start_index = 1; index_of_start_index < last_potential_start_index-1; ++index_of_start_index)
404+
{
405+
distance = potential_start_indexes[index_of_start_index]-potential_start_indexes[index_of_start_index_for_previous_bin];
406+
distance_to_end = potential_start_indexes[last_potential_start_index]-potential_start_indexes[index_of_start_index];
407+
bool can_add_bin{distance>=desired_observations_in_bin && distance_to_end>=desired_observations_in_second_last_bin};
408+
if(can_add_bin)
409+
{
410+
bins_start_index.push_back(potential_start_indexes[index_of_start_index]);
411+
index_of_start_index_for_previous_bin = index_of_start_index;
412+
}
413+
}
414+
415+
bins_start_index.push_back(potential_start_indexes[last_potential_start_index]); //last bin
383416
}
384417
}
385418
}
386-
//End index
419+
//End indexes
387420
if(bins_start_index.size()>0)
388421
{
389422
for (size_t i = 1; i < bins_start_index.size(); ++i)
@@ -692,4 +725,4 @@ std::vector<std::vector<size_t>> distribute_terms_to_cores(std::vector<Term> &te
692725
}
693726

694727
return output;
695-
}
728+
}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
setuptools.setup(
1717
name='aplr',
18-
version='1.0.7',
18+
version='1.0.8',
1919
description='Automatic Piecewise Linear Regression',
2020
ext_modules=[sfc_module],
2121
author="Mathias von Ottenbreit",

0 commit comments

Comments
 (0)