Skip to content

Commit cc9272e

Browse files
authored
Merge pull request #27 from statisticsnorway/dev_bugs_and_vars
Dev bugs and vars
2 parents 8472bd5 + 010e3b6 commit cc9272e

File tree

4 files changed

+127
-94
lines changed

4 files changed

+127
-94
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ssb-statstruk"
3-
version = "0.0.5"
3+
version = "0.0.6"
44
description = "SSB Statstruk"
55
authors = ["Susie Jentoft <coo@ssb.no>"]
66
license = "MIT"

src/statstruk/ratemodel.py

Lines changed: 120 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -431,16 +431,37 @@ def _get_robust(self, strata: str) -> tuple[float, float, float]:
431431
V1 = sum(ai**2 * di_1) + sum(di_1) * ai
432432
V2 = sum(ai**2 * di_2) + sum(di_2) * ai
433433
V3 = sum(ai**2 * di_3) + sum(di_3) * ai
434+
435+
# Adjust negative variance to zero
436+
if any(x < 0 for x in [V1, V2, V3]):
437+
if self.verbose == 2:
438+
print(
439+
"Negative variances calculated. These are being adjusted to 0."
440+
)
441+
V1 = max(V1, 0)
442+
V2 = max(V2, 0)
443+
V3 = max(V3, 0)
444+
434445
return (V1, V2, V3)
435446
else:
436447
return (0, 0, 0)
437448

438-
def _get_variance(self, strata: str) -> Any:
449+
def _get_standard_variance(self, strata: str) -> Any:
439450
"""Get standard variance estimates."""
440451
x_pop = self.strata_results[strata]["x_sum_pop"]
441452
x_utv = self.strata_results[strata]["x_sum_sample"]
442453
s2 = self.strata_results[strata]["sigma2"]
443-
V = x_pop**2 * (x_pop - x_utv) / x_pop * s2 / x_utv
454+
455+
if (x_pop > 0) and (x_utv > 0):
456+
V = x_pop**2 * (x_pop - x_utv) / x_pop * s2 / x_utv
457+
else:
458+
V = np.nan
459+
460+
if V < 0:
461+
if self.verbose == 2:
462+
print("Negative variances calculated. These are being adjusted to 0.")
463+
V = 0
464+
444465
return V
445466

446467
def _get_domain_estimates(self, domain: str, uncertainty_type: str) -> pd.DataFrame:
@@ -463,48 +484,98 @@ def _get_domain_estimates(self, domain: str, uncertainty_type: str) -> pd.DataFr
463484
# Get additional domain information on sample, pop sizes and estimates
464485
N = temp_dom.shape[0]
465486
n = np.sum(temp_dom[self.flag_var] == 1)
466-
est = np.sum(temp_dom[f"{self.y_var}_imputed"])
487+
est = np.sum(temp_dom[f"{self.y_var}_imp"])
467488

468489
# Loop through strata to get the partial variances
469490
var = 0
491+
x_sum_sample = 0
492+
x_sum_pop = 0
470493
for s in strata_unique:
471494
mask_s = (temp_dom[strata_var] == s) & (
472495
temp_dom[self.flag_var] == 0
473496
) # Those not in sample
474-
Uh_sh = np.sum(temp_dom.loc[mask_s, self.x_var])
475-
xh = res[s]["x_sum_sample"]
497+
Uh_sh = np.sum(
498+
temp_dom.loc[mask_s, self.x_var]
499+
) # Sum of x not in sample
500+
xh = res[s]["x_sum_sample"] # Sum of x in sample
476501
s2 = res[s]["sigma2"]
477-
var += s2 * (Uh_sh + xh) / xh * Uh_sh
502+
x_sum_pop += np.sum(temp_dom.loc[temp_dom[strata_var] == s, self.x_var])
503+
x_sum_sample += np.sum(
504+
temp_dom.loc[
505+
(temp_dom[strata_var] == s) & (temp_dom[self.flag_var] == 1),
506+
self.x_var,
507+
]
508+
)
509+
510+
# Add in variance for stratum if sum of x is greater than 0 and var is not na or inf!!!
511+
if (xh > 0) & (not np.isinf(s2)) & (not np.isnan(s2)):
512+
var += s2 * Uh_sh * ((Uh_sh + xh) / xh)
478513

479514
# Add calculations to domain dict
480515
domain_df[d] = {
481516
"domain": d,
482517
"N": N,
483518
"n": n,
519+
f"{self.x_var}_sum_pop": x_sum_pop,
520+
f"{self.x_var}_sum_sample": x_sum_sample,
484521
f"{self.y_var}_EST": est,
485522
f"{self.y_var}_VAR": var,
486-
f"{self.y_var}_SE": np.sqrt(var),
487-
f"{self.y_var}_LB": est - (1.96 * np.sqrt(var)),
488-
f"{self.y_var}_UB": est + (1.96 * np.sqrt(var)),
489-
f"{self.y_var}_CV": np.sqrt(var) / est * 100,
490523
}
491524

492-
# Format and drop variables that are not asked for
493-
domain_pd = pd.DataFrame(domain_df).T
525+
# Convert to pandas
526+
domain_pd = pd.DataFrame([v for k, v in domain_df.items()])
527+
domain_pd = self._clean_output(
528+
domain_pd,
529+
uncertainty_type=uncertainty_type,
530+
variance_type="standard",
531+
return_type="unbiased",
532+
)
533+
534+
return domain_pd
494535

495-
if "CV" not in uncertainty_type:
496-
domain_pd = domain_pd.drop([f"{self.y_var}_CV"], axis=1)
536+
def _clean_output(
537+
self,
538+
result: pd.DataFrame,
539+
uncertainty_type: str,
540+
variance_type: str,
541+
return_type: str,
542+
) -> pd.DataFrame:
543+
"""Clean up results set to include the chosen return type."""
544+
y_var = self.y_var
497545

498-
if "SE" not in uncertainty_type:
499-
domain_pd = domain_pd.drop([f"{self.y_var}_SE"], axis=1)
546+
# Format and add in CV, SE, CI
547+
if variance_type == "standard":
548+
variance_list = [""]
549+
if (variance_type == "robust") & (return_type == "unbiased"):
550+
variance_list = ["2"]
551+
if (variance_type == "robust") & (return_type == "all"):
552+
variance_list = ["1", "2", "3"]
500553

501-
if "CI" not in uncertainty_type:
502-
domain_pd = domain_pd.drop([f"{self.y_var}_LB", f"{self.y_var}_UB"], axis=1)
554+
for i in variance_list:
555+
if "CV" in uncertainty_type:
556+
result[f"{y_var}_CV{i}"] = (
557+
np.sqrt(result[f"{y_var}_VAR{i}"]) / result[f"{y_var}_EST"] * 100
558+
)
503559

504-
if "VAR" not in uncertainty_type:
505-
domain_pd = domain_pd.drop([f"{self.y_var}_VAR"], axis=1)
560+
if "SE" in uncertainty_type:
561+
result[f"{y_var}_SE{i}"] = np.sqrt(result[f"{y_var}_VAR{i}"])
506562

507-
return domain_pd
563+
if "CI" in uncertainty_type:
564+
result[f"{y_var}_LB{i}"] = result[f"{y_var}_EST"] - (
565+
1.96 * np.sqrt(result[f"{y_var}_VAR{i}"])
566+
)
567+
result[f"{y_var}_UB{i}"] = result[f"{y_var}_EST"] + (
568+
1.96 * np.sqrt(result[f"{y_var}_VAR{i}"])
569+
)
570+
571+
if "VAR" not in uncertainty_type:
572+
result = result.drop([f"{y_var}_VAR{i}"], axis=1)
573+
574+
if (return_type == "unbiased") & (variance_type == "robust"):
575+
result = result.drop([f"{y_var}_VAR1"], axis=1)
576+
result = result.drop([f"{y_var}_VAR3"], axis=1)
577+
578+
return result
508579

509580
def _get_domain(self, domain: str) -> Any:
510581
"""Get mapping of domain to the strata results."""
@@ -531,7 +602,7 @@ def _get_domain(self, domain: str) -> Any:
531602
# map key
532603
strata_res = pd.DataFrame(self.strata_results).T
533604
domain_mapped = strata_res["_strata_var_mod"].map(domain_key)
534-
# print(f"domain mapped: {type(domain_mapped)}")
605+
535606
return domain_mapped.str[0]
536607

537608
def get_estimates(
@@ -577,8 +648,8 @@ def get_estimates(
577648
strata_df["N"] = pd.to_numeric(strata_df["N"])
578649
strata_df["n"] = pd.to_numeric(strata_df["n"])
579650

580-
strata_df["x_sum_sample"] = pd.to_numeric(strata_df["x_sum_sample"])
581-
strata_df["x_sum_pop"] = pd.to_numeric(strata_df["x_sum_pop"])
651+
strata_df[f"{self.x_var}_sum_pop"] = pd.to_numeric(strata_df["x_sum_pop"])
652+
strata_df[f"{self.x_var}_sum_sample"] = pd.to_numeric(strata_df["x_sum_sample"])
582653

583654
# Add estimates
584655
strata_df["beta"] = pd.to_numeric(strata_df["beta"])
@@ -590,12 +661,8 @@ def get_estimates(
590661
if variance_type == "standard":
591662
var1 = []
592663
for s in strata_df["_strata_var_mod"]:
593-
var1.append(self._get_variance(s))
664+
var1.append(self._get_standard_variance(s))
594665

595-
# Check for negative variances
596-
if any(x < 0 for x in var1):
597-
print("Negative variances calculated. These are being adjusted to 0.")
598-
var1 = [i if i >= 0 else 0 for i in var1]
599666
strata_df[f"{self.y_var}_VAR"] = np.array(var1)
600667

601668
# Aggregate to domain
@@ -615,18 +682,11 @@ def get_estimates(
615682
var2.append(var[1])
616683
var3.append(var[2])
617684

618-
# Adjust negative variance to zero
619-
if any(x < 0 for x in var1 + var2 + var3):
620-
print("Negative variances calculated. These are being adjusted to 0.")
621-
strata_df[f"{self.y_var}_VAR1"] = np.array(
622-
[i if i >= 0 else 0 for i in var1]
623-
)
624-
strata_df[f"{self.y_var}_VAR2"] = np.array(
625-
[i if i >= 0 else 0 for i in var2]
626-
)
627-
strata_df[f"{self.y_var}_VAR3"] = np.array(
628-
[i if i >= 0 else 0 for i in var3]
629-
)
685+
# Add to results
686+
variables = ["VAR1", "VAR2", "VAR3"]
687+
variance_list = [var1, var2, var3]
688+
for var_name, data in zip(variables, variance_list, strict=False):
689+
strata_df[f"{self.y_var}_{var_name}"] = data
630690

631691
# Aggregate to domain
632692
result = (
@@ -635,6 +695,8 @@ def get_estimates(
635695
domain,
636696
"N",
637697
"n",
698+
f"{self.x_var}_sum_pop",
699+
f"{self.x_var}_sum_sample",
638700
f"{self.y_var}_EST",
639701
f"{self.y_var}_VAR1",
640702
f"{self.y_var}_VAR2",
@@ -646,35 +708,12 @@ def get_estimates(
646708
)
647709

648710
# Format and add in CV, SE, CI
649-
if variance_type == "standard":
650-
variance_list = [""]
651-
if (variance_type == "robust") & (return_type == "unbiased"):
652-
variance_list = ["2"]
653-
if (variance_type == "robust") & (return_type == "all"):
654-
variance_list = ["1", "2", "3"]
655-
656-
for i in variance_list:
657-
658-
if "CV" in uncertainty_type:
659-
result[f"{self.y_var}_CV{i}"] = (
660-
np.sqrt(result[f"{self.y_var}_VAR{i}"])
661-
/ result[f"{self.y_var}_EST"]
662-
* 100
663-
)
664-
665-
if "SE" in uncertainty_type:
666-
result[f"{self.y_var}_SE{i}"] = np.sqrt(result[f"{self.y_var}_VAR{i}"])
667-
668-
if "CI" in uncertainty_type:
669-
result[f"{self.y_var}_LB{i}"] = result[f"{self.y_var}_EST"] - (
670-
1.96 * np.sqrt(result[f"{self.y_var}_VAR{i}"])
671-
)
672-
result[f"{self.y_var}_UB{i}"] = result[f"{self.y_var}_EST"] + (
673-
1.96 * np.sqrt(result[f"{self.y_var}_VAR{i}"])
674-
)
675-
676-
if "VAR" not in uncertainty_type:
677-
result = result.drop([f"{self.y_var}_VAR{i}"], axis=1)
711+
result = self._clean_output(
712+
result,
713+
uncertainty_type=uncertainty_type,
714+
variance_type=variance_type,
715+
return_type=return_type,
716+
)
678717

679718
return result
680719

@@ -757,12 +796,12 @@ def _get_beta(stratum: str) -> Any:
757796
pop["beta"] = pop["_strata_var_mod"].apply(_get_beta)
758797

759798
# Calculate imputed values
760-
pop[f"{self.y_var}_imputed"] = pop["beta"] * pop[self.x_var]
799+
pop[f"{self.y_var}_imp"] = pop["beta"] * pop[self.x_var]
761800

762801
# Link in survey values
763802
id_to_yvar_map = utvalg.set_index(self.id_nr)[self.y_var]
764-
pop[f"{self.y_var}_imputed"] = (
765-
pop[self.id_nr].map(id_to_yvar_map).fillna(pop[f"{self.y_var}_imputed"])
803+
pop[f"{self.y_var}_imp"] = (
804+
pop[self.id_nr].map(id_to_yvar_map).fillna(pop[f"{self.y_var}_imp"])
766805
)
767806
pop_pd = pd.DataFrame(pop)
768807
return pop_pd
@@ -805,3 +844,11 @@ def _check_extreme_run(self) -> None:
805844
raise RuntimeError(
806845
"Model has not been fitted for calculating extreme values. Please re-run fit() with control_extremes = True"
807846
)
847+
848+
def _add_flag(self) -> None:
849+
"""Add flag in population data to say if unit is in the sample or not."""
850+
self.flag_var = f"{self.y_var}_flag_sample"
851+
sample_ids = set(self.sample_data[self.id_nr])
852+
self.pop_data[self.flag_var] = self.pop_data[self.id_nr].apply(
853+
lambda x: 1 if x in sample_ids else 0
854+
)

src/statstruk/ssbmodel.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,6 @@ def _check_variable(
102102
f"Variable '{var_name}' contains missing values. Please fix and try again."
103103
)
104104

105-
def _add_flag(self) -> None:
106-
"""Add flag in population data to say if unit is in the sample or not."""
107-
self.flag_var = "_flag_sample"
108-
sample_ids = set(self.sample_data[self.id_nr])
109-
self.pop_data[self.flag_var] = self.pop_data[self.id_nr].apply(
110-
lambda x: 1 if x in sample_ids else 0
111-
)
112-
113105
def _check_model_run(self) -> None:
114106
"""Check to ensure that model has been run before proceeding with other functions."""
115107
if not hasattr(self, "strata_results"):

tests/test_ratemodel.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def test_statstruk_ratemodel_1obs(capfd):
209209
)
210210

211211

212-
def test_stastruk_ratemodel_check_neg() -> None:
212+
def test_statstruk_ratemodel_check_neg() -> None:
213213
s_data_new = pd.read_csv(sample_file)
214214
p_data = pd.read_csv(pop_file)
215215
s_data_new.iloc[0, 1] = -5
@@ -229,7 +229,7 @@ def test_stastruk_ratemodel_check_neg() -> None:
229229
)
230230

231231

232-
def test_stastruk_ratemodel_check_sample0(capfd) -> None:
232+
def test_statstruk_ratemodel_check_sample0(capfd) -> None:
233233
s_data_new = pd.read_csv(sample_file)
234234
s_data_new = s_data_new.loc[s_data_new.job_vacancies.notna(),]
235235

@@ -258,7 +258,7 @@ def test_stastruk_ratemodel_check_sample0(capfd) -> None:
258258
assert np.isnan(mod1.get_obs["B"]["G"])[0]
259259

260260

261-
def test_stastruk_ratemodel_check_pop0(capfd) -> None:
261+
def test_statstruk_ratemodel_check_pop0(capfd) -> None:
262262
sample1 = pd.read_csv(sample1_file)
263263
pop1 = pd.read_csv(pop1_file)
264264

@@ -272,7 +272,7 @@ def test_stastruk_ratemodel_check_pop0(capfd) -> None:
272272

273273
# check those with x=0 in population are imputed with zero
274274
imp = mod1.get_imputed()
275-
assert imp["job_vacancies_imputed"][0] == 0
275+
assert imp["job_vacancies_imp"][0] == 0
276276

277277

278278
def test_stastruk_ratemodel_check_allbutone0(capfd) -> None:
@@ -386,7 +386,7 @@ def test_stastruk_ratemodel_check_all0(capfd) -> None:
386386
assert all(np.isnan(mod1.get_obs["G"]["G"]))
387387

388388

389-
def test_stastruk_ratemodel_negative_variance(capfd) -> None:
389+
def test_stastruk_ratemodel_negative_variance() -> None:
390390
sample1 = pd.read_csv(sample_file)
391391
sample1 = sample1.loc[sample1.job_vacancies.notna(),]
392392
pop1 = pd.read_csv(pop_file)
@@ -398,7 +398,7 @@ def test_stastruk_ratemodel_negative_variance(capfd) -> None:
398398
sample1 = sample1.loc[(sample1.industry != "F") | (sample1.id.isin(sample_ids))]
399399
pop1 = pop1.loc[(pop1.industry != "F") | (pop1.id.isin(pop_ids))]
400400

401-
mod1 = ratemodel(pop1, sample1, id_nr="id")
401+
mod1 = ratemodel(pop1, sample1, id_nr="id", verbose=2)
402402
mod1.fit(
403403
x_var="employees",
404404
y_var="job_vacancies",
@@ -407,11 +407,5 @@ def test_stastruk_ratemodel_negative_variance(capfd) -> None:
407407
)
408408
res = mod1.get_estimates()
409409

410-
# Use capfd to capture the print output
411-
out, err = capfd.readouterr()
412-
413-
# Assert that the captured output matches the expected message
414-
assert out == "Negative variances calculated. These are being adjusted to 0.\n"
415-
416410
# Check that obs are given na
417411
assert res["job_vacancies_CV2"][4] == 0

0 commit comments

Comments
 (0)