@@ -431,16 +431,37 @@ def _get_robust(self, strata: str) -> tuple[float, float, float]:
431431 V1 = sum (ai ** 2 * di_1 ) + sum (di_1 ) * ai
432432 V2 = sum (ai ** 2 * di_2 ) + sum (di_2 ) * ai
433433 V3 = sum (ai ** 2 * di_3 ) + sum (di_3 ) * ai
434+
435+ # Adjust negative variance to zero
436+ if any (x < 0 for x in [V1 , V2 , V3 ]):
437+ if self .verbose == 2 :
438+ print (
439+ "Negative variances calculated. These are being adjusted to 0."
440+ )
441+ V1 = max (V1 , 0 )
442+ V2 = max (V2 , 0 )
443+ V3 = max (V3 , 0 )
444+
434445 return (V1 , V2 , V3 )
435446 else :
436447 return (0 , 0 , 0 )
437448
438- def _get_variance (self , strata : str ) -> Any :
449+ def _get_standard_variance (self , strata : str ) -> Any :
439450 """Get standard variance estimates."""
440451 x_pop = self .strata_results [strata ]["x_sum_pop" ]
441452 x_utv = self .strata_results [strata ]["x_sum_sample" ]
442453 s2 = self .strata_results [strata ]["sigma2" ]
443- V = x_pop ** 2 * (x_pop - x_utv ) / x_pop * s2 / x_utv
454+
455+ if (x_pop > 0 ) and (x_utv > 0 ):
456+ V = x_pop ** 2 * (x_pop - x_utv ) / x_pop * s2 / x_utv
457+ else :
458+ V = np .nan
459+
460+ if V < 0 :
461+ if self .verbose == 2 :
462+ print ("Negative variances calculated. These are being adjusted to 0." )
463+ V = 0
464+
444465 return V
445466
446467 def _get_domain_estimates (self , domain : str , uncertainty_type : str ) -> pd .DataFrame :
@@ -463,48 +484,98 @@ def _get_domain_estimates(self, domain: str, uncertainty_type: str) -> pd.DataFr
463484 # Get additional domain information on sample, pop sizes and estimates
464485 N = temp_dom .shape [0 ]
465486 n = np .sum (temp_dom [self .flag_var ] == 1 )
466- est = np .sum (temp_dom [f"{ self .y_var } _imputed " ])
487+ est = np .sum (temp_dom [f"{ self .y_var } _imp " ])
467488
468489 # Loop through strata to get the partial variances
469490 var = 0
491+ x_sum_sample = 0
492+ x_sum_pop = 0
470493 for s in strata_unique :
471494 mask_s = (temp_dom [strata_var ] == s ) & (
472495 temp_dom [self .flag_var ] == 0
473496 ) # Those not in sample
474- Uh_sh = np .sum (temp_dom .loc [mask_s , self .x_var ])
475- xh = res [s ]["x_sum_sample" ]
497+ Uh_sh = np .sum (
498+ temp_dom .loc [mask_s , self .x_var ]
499+ ) # Sum of x not in sample
500+ xh = res [s ]["x_sum_sample" ] # Sum of x in sample
476501 s2 = res [s ]["sigma2" ]
477- var += s2 * (Uh_sh + xh ) / xh * Uh_sh
502+ x_sum_pop += np .sum (temp_dom .loc [temp_dom [strata_var ] == s , self .x_var ])
503+ x_sum_sample += np .sum (
504+ temp_dom .loc [
505+ (temp_dom [strata_var ] == s ) & (temp_dom [self .flag_var ] == 1 ),
506+ self .x_var ,
507+ ]
508+ )
509+
510+ # Add in variance for stratum if sum of x is greater than 0 and var is not na or inf!!!
511+ if (xh > 0 ) & (not np .isinf (s2 )) & (not np .isnan (s2 )):
512+ var += s2 * Uh_sh * ((Uh_sh + xh ) / xh )
478513
479514 # Add calculations to domain dict
480515 domain_df [d ] = {
481516 "domain" : d ,
482517 "N" : N ,
483518 "n" : n ,
519+ f"{ self .x_var } _sum_pop" : x_sum_pop ,
520+ f"{ self .x_var } _sum_sample" : x_sum_sample ,
484521 f"{ self .y_var } _EST" : est ,
485522 f"{ self .y_var } _VAR" : var ,
486- f"{ self .y_var } _SE" : np .sqrt (var ),
487- f"{ self .y_var } _LB" : est - (1.96 * np .sqrt (var )),
488- f"{ self .y_var } _UB" : est + (1.96 * np .sqrt (var )),
489- f"{ self .y_var } _CV" : np .sqrt (var ) / est * 100 ,
490523 }
491524
492- # Format and drop variables that are not asked for
493- domain_pd = pd .DataFrame (domain_df ).T
525+ # Convert to pandas
526+ domain_pd = pd .DataFrame ([v for k , v in domain_df .items ()])
527+ domain_pd = self ._clean_output (
528+ domain_pd ,
529+ uncertainty_type = uncertainty_type ,
530+ variance_type = "standard" ,
531+ return_type = "unbiased" ,
532+ )
533+
534+ return domain_pd
494535
495- if "CV" not in uncertainty_type :
496- domain_pd = domain_pd .drop ([f"{ self .y_var } _CV" ], axis = 1 )
536+ def _clean_output (
537+ self ,
538+ result : pd .DataFrame ,
539+ uncertainty_type : str ,
540+ variance_type : str ,
541+ return_type : str ,
542+ ) -> pd .DataFrame :
543+ """Clean up results set to include the chosen return type."""
544+ y_var = self .y_var
497545
498- if "SE" not in uncertainty_type :
499- domain_pd = domain_pd .drop ([f"{ self .y_var } _SE" ], axis = 1 )
546+ # Format and add in CV, SE, CI
547+ if variance_type == "standard" :
548+ variance_list = ["" ]
549+ if (variance_type == "robust" ) & (return_type == "unbiased" ):
550+ variance_list = ["2" ]
551+ if (variance_type == "robust" ) & (return_type == "all" ):
552+ variance_list = ["1" , "2" , "3" ]
500553
501- if "CI" not in uncertainty_type :
502- domain_pd = domain_pd .drop ([f"{ self .y_var } _LB" , f"{ self .y_var } _UB" ], axis = 1 )
554+ for i in variance_list :
555+ if "CV" in uncertainty_type :
556+ result [f"{ y_var } _CV{ i } " ] = (
557+ np .sqrt (result [f"{ y_var } _VAR{ i } " ]) / result [f"{ y_var } _EST" ] * 100
558+ )
503559
504- if "VAR" not in uncertainty_type :
505- domain_pd = domain_pd . drop ( [f"{ self . y_var } _VAR" ], axis = 1 )
560+ if "SE" in uncertainty_type :
561+ result [ f" { y_var } _SE { i } " ] = np . sqrt ( result [f"{ y_var } _VAR{ i } " ] )
506562
507- return domain_pd
563+ if "CI" in uncertainty_type :
564+ result [f"{ y_var } _LB{ i } " ] = result [f"{ y_var } _EST" ] - (
565+ 1.96 * np .sqrt (result [f"{ y_var } _VAR{ i } " ])
566+ )
567+ result [f"{ y_var } _UB{ i } " ] = result [f"{ y_var } _EST" ] + (
568+ 1.96 * np .sqrt (result [f"{ y_var } _VAR{ i } " ])
569+ )
570+
571+ if "VAR" not in uncertainty_type :
572+ result = result .drop ([f"{ y_var } _VAR{ i } " ], axis = 1 )
573+
574+ if (return_type == "unbiased" ) & (variance_type == "robust" ):
575+ result = result .drop ([f"{ y_var } _VAR1" ], axis = 1 )
576+ result = result .drop ([f"{ y_var } _VAR3" ], axis = 1 )
577+
578+ return result
508579
509580 def _get_domain (self , domain : str ) -> Any :
510581 """Get mapping of domain to the strata results."""
@@ -531,7 +602,7 @@ def _get_domain(self, domain: str) -> Any:
531602 # map key
532603 strata_res = pd .DataFrame (self .strata_results ).T
533604 domain_mapped = strata_res ["_strata_var_mod" ].map (domain_key )
534- # print(f"domain mapped: {type(domain_mapped)}")
605+
535606 return domain_mapped .str [0 ]
536607
537608 def get_estimates (
@@ -577,8 +648,8 @@ def get_estimates(
577648 strata_df ["N" ] = pd .to_numeric (strata_df ["N" ])
578649 strata_df ["n" ] = pd .to_numeric (strata_df ["n" ])
579650
580- strata_df ["x_sum_sample " ] = pd .to_numeric (strata_df ["x_sum_sample " ])
581- strata_df ["x_sum_pop " ] = pd .to_numeric (strata_df ["x_sum_pop " ])
651+ strata_df [f" { self . x_var } _sum_pop " ] = pd .to_numeric (strata_df ["x_sum_pop " ])
652+ strata_df [f" { self . x_var } _sum_sample " ] = pd .to_numeric (strata_df ["x_sum_sample " ])
582653
583654 # Add estimates
584655 strata_df ["beta" ] = pd .to_numeric (strata_df ["beta" ])
@@ -590,12 +661,8 @@ def get_estimates(
590661 if variance_type == "standard" :
591662 var1 = []
592663 for s in strata_df ["_strata_var_mod" ]:
593- var1 .append (self ._get_variance (s ))
664+ var1 .append (self ._get_standard_variance (s ))
594665
595- # Check for negative variances
596- if any (x < 0 for x in var1 ):
597- print ("Negative variances calculated. These are being adjusted to 0." )
598- var1 = [i if i >= 0 else 0 for i in var1 ]
599666 strata_df [f"{ self .y_var } _VAR" ] = np .array (var1 )
600667
601668 # Aggregate to domain
@@ -615,18 +682,11 @@ def get_estimates(
615682 var2 .append (var [1 ])
616683 var3 .append (var [2 ])
617684
618- # Adjust negative variance to zero
619- if any (x < 0 for x in var1 + var2 + var3 ):
620- print ("Negative variances calculated. These are being adjusted to 0." )
621- strata_df [f"{ self .y_var } _VAR1" ] = np .array (
622- [i if i >= 0 else 0 for i in var1 ]
623- )
624- strata_df [f"{ self .y_var } _VAR2" ] = np .array (
625- [i if i >= 0 else 0 for i in var2 ]
626- )
627- strata_df [f"{ self .y_var } _VAR3" ] = np .array (
628- [i if i >= 0 else 0 for i in var3 ]
629- )
685+ # Add to results
686+ variables = ["VAR1" , "VAR2" , "VAR3" ]
687+ variance_list = [var1 , var2 , var3 ]
688+ for var_name , data in zip (variables , variance_list , strict = False ):
689+ strata_df [f"{ self .y_var } _{ var_name } " ] = data
630690
631691 # Aggregate to domain
632692 result = (
@@ -635,6 +695,8 @@ def get_estimates(
635695 domain ,
636696 "N" ,
637697 "n" ,
698+ f"{ self .x_var } _sum_pop" ,
699+ f"{ self .x_var } _sum_sample" ,
638700 f"{ self .y_var } _EST" ,
639701 f"{ self .y_var } _VAR1" ,
640702 f"{ self .y_var } _VAR2" ,
@@ -646,35 +708,12 @@ def get_estimates(
646708 )
647709
648710 # Format and add in CV, SE, CI
649- if variance_type == "standard" :
650- variance_list = ["" ]
651- if (variance_type == "robust" ) & (return_type == "unbiased" ):
652- variance_list = ["2" ]
653- if (variance_type == "robust" ) & (return_type == "all" ):
654- variance_list = ["1" , "2" , "3" ]
655-
656- for i in variance_list :
657-
658- if "CV" in uncertainty_type :
659- result [f"{ self .y_var } _CV{ i } " ] = (
660- np .sqrt (result [f"{ self .y_var } _VAR{ i } " ])
661- / result [f"{ self .y_var } _EST" ]
662- * 100
663- )
664-
665- if "SE" in uncertainty_type :
666- result [f"{ self .y_var } _SE{ i } " ] = np .sqrt (result [f"{ self .y_var } _VAR{ i } " ])
667-
668- if "CI" in uncertainty_type :
669- result [f"{ self .y_var } _LB{ i } " ] = result [f"{ self .y_var } _EST" ] - (
670- 1.96 * np .sqrt (result [f"{ self .y_var } _VAR{ i } " ])
671- )
672- result [f"{ self .y_var } _UB{ i } " ] = result [f"{ self .y_var } _EST" ] + (
673- 1.96 * np .sqrt (result [f"{ self .y_var } _VAR{ i } " ])
674- )
675-
676- if "VAR" not in uncertainty_type :
677- result = result .drop ([f"{ self .y_var } _VAR{ i } " ], axis = 1 )
711+ result = self ._clean_output (
712+ result ,
713+ uncertainty_type = uncertainty_type ,
714+ variance_type = variance_type ,
715+ return_type = return_type ,
716+ )
678717
679718 return result
680719
@@ -757,12 +796,12 @@ def _get_beta(stratum: str) -> Any:
757796 pop ["beta" ] = pop ["_strata_var_mod" ].apply (_get_beta )
758797
759798 # Calculate imputed values
760- pop [f"{ self .y_var } _imputed " ] = pop ["beta" ] * pop [self .x_var ]
799+ pop [f"{ self .y_var } _imp " ] = pop ["beta" ] * pop [self .x_var ]
761800
762801 # Link in survey values
763802 id_to_yvar_map = utvalg .set_index (self .id_nr )[self .y_var ]
764- pop [f"{ self .y_var } _imputed " ] = (
765- pop [self .id_nr ].map (id_to_yvar_map ).fillna (pop [f"{ self .y_var } _imputed " ])
803+ pop [f"{ self .y_var } _imp " ] = (
804+ pop [self .id_nr ].map (id_to_yvar_map ).fillna (pop [f"{ self .y_var } _imp " ])
766805 )
767806 pop_pd = pd .DataFrame (pop )
768807 return pop_pd
@@ -805,3 +844,11 @@ def _check_extreme_run(self) -> None:
805844 raise RuntimeError (
806845 "Model has not been fitted for calculating extreme values. Please re-run fit() with control_extremes = True"
807846 )
847+
848+ def _add_flag (self ) -> None :
849+ """Add flag in population data to say if unit is in the sample or not."""
850+ self .flag_var = f"{ self .y_var } _flag_sample"
851+ sample_ids = set (self .sample_data [self .id_nr ])
852+ self .pop_data [self .flag_var ] = self .pop_data [self .id_nr ].apply (
853+ lambda x : 1 if x in sample_ids else 0
854+ )
0 commit comments