1515from fractions import Fraction
1616
1717from functools import reduce
18- from typing import Callable , Dict , List , Union
18+ from typing import Callable , Dict , List , Tuple , Union
1919
2020import numpy as np
2121import pandas as pd
2222
2323from balance import adjustment as balance_adjustment , util as balance_util
24- from ipfn import ipfn
2524
2625logger = logging .getLogger (__package__ )
2726
2827
2928# TODO: Add options for only marginal distributions input
29+ def _run_ipf_numpy (
30+ original : np .ndarray ,
31+ target_margins : List [np .ndarray ],
32+ convergence_rate : float ,
33+ max_iteration : int ,
34+ rate_tolerance : float ,
35+ ) -> Tuple [np .ndarray , int , pd .DataFrame ]:
36+ """Run iterative proportional fitting on a NumPy array.
37+
38+ This reimplements the minimal subset of the :mod:`ipfn` package that is
39+ required for balance's usage. The original implementation spends most of
40+ its time looping in pure Python over every slice of the contingency table,
41+ which is prohibitively slow for the high-cardinality problems we test
42+ against. The logic here mirrors the algorithm used by ``ipfn.ipfn`` but
43+ applies the adjustments in a vectorised manner, yielding identical
44+ numerical results with a fraction of the runtime.
45+
46+ The caller is expected to pass ``target_margins`` that correspond to
47+ single-axis marginals (which is how :func:`rake` constructs the inputs).
48+ """
49+
50+ if original .ndim == 0 :
51+ raise ValueError ("`original` must have at least one dimension" )
52+
53+ table = np .asarray (original , dtype = np .float64 )
54+ margins = [np .asarray (margin , dtype = np .float64 ) for margin in target_margins ]
55+
56+ # Pre-compute shapes and axes that are repeatedly required during the
57+ # iterative updates. Each entry in ``axis_shapes`` represents how a
58+ # one-dimensional scaling factor should be reshaped in order to broadcast
59+ # along the appropriate axis of ``table``.
60+ axis_shapes : List [Tuple [int , ...]] = []
61+ sum_axes : List [Tuple [int , ...]] = []
62+ for axis in range (table .ndim ):
63+ shape = [1 ] * table .ndim
64+ shape [axis ] = table .shape [axis ]
65+ axis_shapes .append (tuple (shape ))
66+ sum_axes .append (tuple (i for i in range (table .ndim ) if i != axis ))
67+
68+ conv = np .inf
69+ old_conv = - np .inf
70+ conv_history : List [float ] = []
71+ iteration = 0
72+
73+ while (
74+ iteration <= max_iteration
75+ and conv > convergence_rate
76+ and abs (conv - old_conv ) > rate_tolerance
77+ ):
78+ old_conv = conv
79+
80+ # Sequentially update the table for each marginal. Because the
81+ # marginals correspond to single axes we can compute all scaling
82+ # factors at once, avoiding the expensive Python loops present in the
83+ # reference implementation.
84+ for axis , margin in enumerate (margins ):
85+ current = table .sum (axis = sum_axes [axis ])
86+ factors = np .ones_like (margin , dtype = np .float64 )
87+ np .divide (margin , current , out = factors , where = current != 0 )
88+ table *= factors .reshape (axis_shapes [axis ])
89+
90+ # Measure convergence using the same criterion as ``ipfn.ipfn``. The
91+ # implementation there keeps the maximum absolute proportional
92+ # difference while naturally ignoring NaNs (which arise for 0/0). We
93+ # match that behaviour by treating NaNs as zero deviation.
94+ conv = 0.0
95+ for axis , margin in enumerate (margins ):
96+ current = table .sum (axis = sum_axes [axis ])
97+ with np .errstate (divide = "ignore" , invalid = "ignore" ):
98+ diff = np .abs (np .divide (current , margin ) - 1.0 )
99+ current_conv = float (np .nanmax (diff )) if diff .size else 0.0
100+ if math .isnan (current_conv ):
101+ current_conv = 0.0
102+ if current_conv > conv :
103+ conv = current_conv
104+
105+ conv_history .append (conv )
106+ iteration += 1
107+
108+ converged = int (iteration <= max_iteration )
109+ iterations_df = pd .DataFrame (
110+ {"iteration" : range (len (conv_history )), "conv" : conv_history }
111+ ).set_index ("iteration" )
112+
113+ return table , converged , iterations_df
114+
115+
30116def rake (
31117 sample_df : pd .DataFrame ,
32118 sample_weights : pd .Series ,
@@ -179,24 +265,11 @@ def rake(
179265 # Calculate {# covariates}-dimensional array representation of the sample
180266 # for the ipfn algorithm
181267
182- # Create a multi-index DataFrame with all possible combinations
268+ grouped_sample_series = sample_df . groupby ( alphabetized_variables )[ "weight" ]. sum ()
183269 index = pd .MultiIndex .from_product (categories , names = alphabetized_variables )
184- full_df = pd .DataFrame (index = index ).reset_index ()
185-
186- # Group by covariates and sum weights
187- grouped_sample = (
188- sample_df .groupby (alphabetized_variables )["weight" ].sum ().reset_index ()
189- )
190-
191- # Merge to ensure all combinations are present (fill missing with 0)
192- merged = (
193- pd .merge (full_df , grouped_sample , on = alphabetized_variables , how = "left" )
194- .fillna (0 )
195- .infer_objects (copy = False )
196- )
197-
198- # Reshape to n-dimensional array
199- m_sample = merged ["weight" ].values .reshape ([len (c ) for c in categories ])
270+ grouped_sample_full = grouped_sample_series .reindex (index , fill_value = 0 )
271+ m_sample = grouped_sample_full .to_numpy ().reshape ([len (c ) for c in categories ])
272+ m_fit_input = m_sample .copy ()
200273
201274 # Calculate target margins for ipfn
202275 target_margins = []
@@ -208,7 +281,6 @@ def rake(
208281 )
209282 sums = sums .reindex (cats , fill_value = 0 )
210283 target_margins .append (sums .values )
211- dimensions = [[i ] for i in range (len (alphabetized_variables ))]
212284
213285 logger .debug (
214286 "Raking algorithm running following settings: "
@@ -219,16 +291,13 @@ def rake(
219291 # for that specific set of covariates
220292 # no longer uses the dataframe version of the ipfn algorithm
221293 # due to incompatability with latest Python versions
222- ipfn_obj = ipfn .ipfn (
223- original = m_sample ,
224- aggregates = target_margins ,
225- dimensions = dimensions ,
226- convergence_rate = convergence_rate ,
227- max_iteration = max_iteration ,
228- verbose = 2 ,
229- rate_tolerance = rate_tolerance ,
294+ m_fit , converged , iterations = _run_ipf_numpy (
295+ m_fit_input ,
296+ target_margins ,
297+ convergence_rate ,
298+ max_iteration ,
299+ rate_tolerance ,
230300 )
231- m_fit , converged , iterations = ipfn_obj .iteration ()
232301
233302 logger .debug (
234303 f"Raking algorithm terminated with following convergence: { converged } ; "
@@ -238,14 +307,9 @@ def rake(
238307 if not converged :
239308 logger .warning ("Maximum iterations reached, convergence was not achieved" )
240309
241- # Convert array representation of the weighted sample into a dataframe
242- # Generate all possible combinations of categories (cartesian product)
243310 combos = list (itertools .product (* categories ))
244- # Flatten the array to match the order of combos
245- weights = m_fit .flatten ()
246- # Build the DataFrame
247311 fit = pd .DataFrame (combos , columns = alphabetized_variables )
248- fit ["rake_weight" ] = weights
312+ fit ["rake_weight" ] = m_fit . flatten ()
249313
250314 raked = pd .merge (
251315 sample_df .reset_index (),
@@ -254,23 +318,24 @@ def rake(
254318 on = alphabetized_variables ,
255319 )
256320
257- # Merge back to original sample weights
258321 raked_rescaled = pd .merge (
259322 raked ,
260- (grouped_sample .rename (columns = {"weight" : "total_survey_weight" })),
323+ grouped_sample_series .reset_index ().rename (
324+ columns = {"weight" : "total_survey_weight" }
325+ ),
261326 how = "left" ,
262327 on = alphabetized_variables ,
263- ).set_index ("index" ) # important if dropping rows due to NA
328+ ).set_index ("index" )
264329
265- # use above merge to give each individual sample its proportion of the
266- # cell's total weight
267330 raked_rescaled ["rake_weight" ] = (
268331 raked_rescaled ["rake_weight" ] / raked_rescaled ["total_survey_weight" ]
269332 )
270- # rescale weights to sum to target_sum_weights (sum of initial target weights)
333+
271334 w = (
272- raked_rescaled ["rake_weight" ] / raked_rescaled ["rake_weight" ].sum ()
273- ) * target_sum_weights
335+ raked_rescaled ["rake_weight" ]
336+ / raked_rescaled ["rake_weight" ].sum ()
337+ * target_sum_weights
338+ ).rename ("rake_weight" )
274339 return {
275340 "weight" : w ,
276341 "model" : {
0 commit comments