Merge pull request #27 from statisticsnorway/dev_bugs_and_vars

sjentoft · web-flow · commit cc9272ed001e · 2024-04-26T12:13:13.000+02:00
Dev bugs and vars
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-statstruk"
-version = "0.0.5"
+version = "0.0.6"
 description = "SSB Statstruk"
 authors = ["Susie Jentoft <coo@ssb.no>"]
 license = "MIT"
diff --git a/src/statstruk/ratemodel.py b/src/statstruk/ratemodel.py
@@ -431,16 +431,37 @@ def _get_robust(self, strata: str) -> tuple[float, float, float]:
             V1 = sum(ai**2 * di_1) + sum(di_1) * ai
             V2 = sum(ai**2 * di_2) + sum(di_2) * ai
             V3 = sum(ai**2 * di_3) + sum(di_3) * ai
+
+            # Adjust negative variance to zero
+            if any(x < 0 for x in [V1, V2, V3]):
+                if self.verbose == 2:
+                    print(
+                        "Negative variances calculated. These are being adjusted to 0."
+                    )
+                V1 = max(V1, 0)
+                V2 = max(V2, 0)
+                V3 = max(V3, 0)
+
             return (V1, V2, V3)
         else:
             return (0, 0, 0)
 
-    def _get_variance(self, strata: str) -> Any:
+    def _get_standard_variance(self, strata: str) -> Any:
         """Get standard variance estimates."""
         x_pop = self.strata_results[strata]["x_sum_pop"]
         x_utv = self.strata_results[strata]["x_sum_sample"]
         s2 = self.strata_results[strata]["sigma2"]
-        V = x_pop**2 * (x_pop - x_utv) / x_pop * s2 / x_utv
+
+        if (x_pop > 0) and (x_utv > 0):
+            V = x_pop**2 * (x_pop - x_utv) / x_pop * s2 / x_utv
+        else:
+            V = np.nan
+
+        if V < 0:
+            if self.verbose == 2:
+                print("Negative variances calculated. These are being adjusted to 0.")
+            V = 0
+
         return V
 
     def _get_domain_estimates(self, domain: str, uncertainty_type: str) -> pd.DataFrame:
@@ -463,48 +484,98 @@ def _get_domain_estimates(self, domain: str, uncertainty_type: str) -> pd.DataFr
             # Get additional domain information on sample, pop sizes and estimates
             N = temp_dom.shape[0]
             n = np.sum(temp_dom[self.flag_var] == 1)
-            est = np.sum(temp_dom[f"{self.y_var}_imputed"])
+            est = np.sum(temp_dom[f"{self.y_var}_imp"])
 
             # Loop through strata to get the partial variances
             var = 0
+            x_sum_sample = 0
+            x_sum_pop = 0
             for s in strata_unique:
                 mask_s = (temp_dom[strata_var] == s) & (
                     temp_dom[self.flag_var] == 0
                 )  # Those not in sample
-                Uh_sh = np.sum(temp_dom.loc[mask_s, self.x_var])
-                xh = res[s]["x_sum_sample"]
+                Uh_sh = np.sum(
+                    temp_dom.loc[mask_s, self.x_var]
+                )  # Sum of x not in sample
+                xh = res[s]["x_sum_sample"]  # Sum of x in sample
                 s2 = res[s]["sigma2"]
-                var += s2 * (Uh_sh + xh) / xh * Uh_sh
+                x_sum_pop += np.sum(temp_dom.loc[temp_dom[strata_var] == s, self.x_var])
+                x_sum_sample += np.sum(
+                    temp_dom.loc[
+                        (temp_dom[strata_var] == s) & (temp_dom[self.flag_var] == 1),
+                        self.x_var,
+                    ]
+                )
+
+                # Add in variance for stratum if sum of x is greater than 0 and var is not na or inf!!!
+                if (xh > 0) & (not np.isinf(s2)) & (not np.isnan(s2)):
+                    var += s2 * Uh_sh * ((Uh_sh + xh) / xh)
 
             # Add calculations to domain dict
             domain_df[d] = {
                 "domain": d,
                 "N": N,
                 "n": n,
+                f"{self.x_var}_sum_pop": x_sum_pop,
+                f"{self.x_var}_sum_sample": x_sum_sample,
                 f"{self.y_var}_EST": est,
                 f"{self.y_var}_VAR": var,
-                f"{self.y_var}_SE": np.sqrt(var),
-                f"{self.y_var}_LB": est - (1.96 * np.sqrt(var)),
-                f"{self.y_var}_UB": est + (1.96 * np.sqrt(var)),
-                f"{self.y_var}_CV": np.sqrt(var) / est * 100,
             }
 
-        # Format and drop variables that are not asked for
-        domain_pd = pd.DataFrame(domain_df).T
+        # Convert to pandas
+        domain_pd = pd.DataFrame([v for k, v in domain_df.items()])
+        domain_pd = self._clean_output(
+            domain_pd,
+            uncertainty_type=uncertainty_type,
+            variance_type="standard",
+            return_type="unbiased",
+        )
+
+        return domain_pd
 
-        if "CV" not in uncertainty_type:
-            domain_pd = domain_pd.drop([f"{self.y_var}_CV"], axis=1)
+    def _clean_output(
+        self,
+        result: pd.DataFrame,
+        uncertainty_type: str,
+        variance_type: str,
+        return_type: str,
+    ) -> pd.DataFrame:
+        """Clean up results set to include the chosen return type."""
+        y_var = self.y_var
 
-        if "SE" not in uncertainty_type:
-            domain_pd = domain_pd.drop([f"{self.y_var}_SE"], axis=1)
+        # Format and add in CV, SE, CI
+        if variance_type == "standard":
+            variance_list = [""]
+        if (variance_type == "robust") & (return_type == "unbiased"):
+            variance_list = ["2"]
+        if (variance_type == "robust") & (return_type == "all"):
+            variance_list = ["1", "2", "3"]
 
-        if "CI" not in uncertainty_type:
-            domain_pd = domain_pd.drop([f"{self.y_var}_LB", f"{self.y_var}_UB"], axis=1)
+        for i in variance_list:
+            if "CV" in uncertainty_type:
+                result[f"{y_var}_CV{i}"] = (
+                    np.sqrt(result[f"{y_var}_VAR{i}"]) / result[f"{y_var}_EST"] * 100
+                )
 
-        if "VAR" not in uncertainty_type:
-            domain_pd = domain_pd.drop([f"{self.y_var}_VAR"], axis=1)
+            if "SE" in uncertainty_type:
+                result[f"{y_var}_SE{i}"] = np.sqrt(result[f"{y_var}_VAR{i}"])
 
-        return domain_pd
+            if "CI" in uncertainty_type:
+                result[f"{y_var}_LB{i}"] = result[f"{y_var}_EST"] - (
+                    1.96 * np.sqrt(result[f"{y_var}_VAR{i}"])
+                )
+                result[f"{y_var}_UB{i}"] = result[f"{y_var}_EST"] + (
+                    1.96 * np.sqrt(result[f"{y_var}_VAR{i}"])
+                )
+
+            if "VAR" not in uncertainty_type:
+                result = result.drop([f"{y_var}_VAR{i}"], axis=1)
+
+            if (return_type == "unbiased") & (variance_type == "robust"):
+                result = result.drop([f"{y_var}_VAR1"], axis=1)
+                result = result.drop([f"{y_var}_VAR3"], axis=1)
+
+        return result
 
     def _get_domain(self, domain: str) -> Any:
         """Get mapping of domain to the strata results."""
@@ -531,7 +602,7 @@ def _get_domain(self, domain: str) -> Any:
         # map key
         strata_res = pd.DataFrame(self.strata_results).T
         domain_mapped = strata_res["_strata_var_mod"].map(domain_key)
-        # print(f"domain mapped: {type(domain_mapped)}")
+
         return domain_mapped.str[0]
 
     def get_estimates(
@@ -577,8 +648,8 @@ def get_estimates(
         strata_df["N"] = pd.to_numeric(strata_df["N"])
         strata_df["n"] = pd.to_numeric(strata_df["n"])
 
-        strata_df["x_sum_sample"] = pd.to_numeric(strata_df["x_sum_sample"])
-        strata_df["x_sum_pop"] = pd.to_numeric(strata_df["x_sum_pop"])
+        strata_df[f"{self.x_var}_sum_pop"] = pd.to_numeric(strata_df["x_sum_pop"])
+        strata_df[f"{self.x_var}_sum_sample"] = pd.to_numeric(strata_df["x_sum_sample"])
 
         # Add estimates
         strata_df["beta"] = pd.to_numeric(strata_df["beta"])
@@ -590,12 +661,8 @@ def get_estimates(
         if variance_type == "standard":
             var1 = []
             for s in strata_df["_strata_var_mod"]:
-                var1.append(self._get_variance(s))
+                var1.append(self._get_standard_variance(s))
 
-            # Check for negative variances
-            if any(x < 0 for x in var1):
-                print("Negative variances calculated. These are being adjusted to 0.")
-                var1 = [i if i >= 0 else 0 for i in var1]
             strata_df[f"{self.y_var}_VAR"] = np.array(var1)
 
             # Aggregate to domain
@@ -615,18 +682,11 @@ def get_estimates(
                     var2.append(var[1])
                     var3.append(var[2])
 
-            # Adjust negative variance to zero
-            if any(x < 0 for x in var1 + var2 + var3):
-                print("Negative variances calculated. These are being adjusted to 0.")
-            strata_df[f"{self.y_var}_VAR1"] = np.array(
-                [i if i >= 0 else 0 for i in var1]
-            )
-            strata_df[f"{self.y_var}_VAR2"] = np.array(
-                [i if i >= 0 else 0 for i in var2]
-            )
-            strata_df[f"{self.y_var}_VAR3"] = np.array(
-                [i if i >= 0 else 0 for i in var3]
-            )
+            # Add to results
+            variables = ["VAR1", "VAR2", "VAR3"]
+            variance_list = [var1, var2, var3]
+            for var_name, data in zip(variables, variance_list, strict=False):
+                strata_df[f"{self.y_var}_{var_name}"] = data
 
             # Aggregate to domain
             result = (
@@ -635,6 +695,8 @@ def get_estimates(
                         domain,
                         "N",
                         "n",
+                        f"{self.x_var}_sum_pop",
+                        f"{self.x_var}_sum_sample",
                         f"{self.y_var}_EST",
                         f"{self.y_var}_VAR1",
                         f"{self.y_var}_VAR2",
@@ -646,35 +708,12 @@ def get_estimates(
             )
 
         # Format and add in CV, SE, CI
-        if variance_type == "standard":
-            variance_list = [""]
-        if (variance_type == "robust") & (return_type == "unbiased"):
-            variance_list = ["2"]
-        if (variance_type == "robust") & (return_type == "all"):
-            variance_list = ["1", "2", "3"]
-
-        for i in variance_list:
-
-            if "CV" in uncertainty_type:
-                result[f"{self.y_var}_CV{i}"] = (
-                    np.sqrt(result[f"{self.y_var}_VAR{i}"])
-                    / result[f"{self.y_var}_EST"]
-                    * 100
-                )
-
-            if "SE" in uncertainty_type:
-                result[f"{self.y_var}_SE{i}"] = np.sqrt(result[f"{self.y_var}_VAR{i}"])
-
-            if "CI" in uncertainty_type:
-                result[f"{self.y_var}_LB{i}"] = result[f"{self.y_var}_EST"] - (
-                    1.96 * np.sqrt(result[f"{self.y_var}_VAR{i}"])
-                )
-                result[f"{self.y_var}_UB{i}"] = result[f"{self.y_var}_EST"] + (
-                    1.96 * np.sqrt(result[f"{self.y_var}_VAR{i}"])
-                )
-
-            if "VAR" not in uncertainty_type:
-                result = result.drop([f"{self.y_var}_VAR{i}"], axis=1)
+        result = self._clean_output(
+            result,
+            uncertainty_type=uncertainty_type,
+            variance_type=variance_type,
+            return_type=return_type,
+        )
 
         return result
 
@@ -757,12 +796,12 @@ def _get_beta(stratum: str) -> Any:
         pop["beta"] = pop["_strata_var_mod"].apply(_get_beta)
 
         # Calculate imputed values
-        pop[f"{self.y_var}_imputed"] = pop["beta"] * pop[self.x_var]
+        pop[f"{self.y_var}_imp"] = pop["beta"] * pop[self.x_var]
 
         # Link in survey values
         id_to_yvar_map = utvalg.set_index(self.id_nr)[self.y_var]
-        pop[f"{self.y_var}_imputed"] = (
-            pop[self.id_nr].map(id_to_yvar_map).fillna(pop[f"{self.y_var}_imputed"])
+        pop[f"{self.y_var}_imp"] = (
+            pop[self.id_nr].map(id_to_yvar_map).fillna(pop[f"{self.y_var}_imp"])
         )
         pop_pd = pd.DataFrame(pop)
         return pop_pd
@@ -805,3 +844,11 @@ def _check_extreme_run(self) -> None:
             raise RuntimeError(
                 "Model has not been fitted for calculating extreme values. Please re-run fit() with control_extremes = True"
             )
+
+    def _add_flag(self) -> None:
+        """Add flag in population data to say if unit is in the sample or not."""
+        self.flag_var = f"{self.y_var}_flag_sample"
+        sample_ids = set(self.sample_data[self.id_nr])
+        self.pop_data[self.flag_var] = self.pop_data[self.id_nr].apply(
+            lambda x: 1 if x in sample_ids else 0
+        )
diff --git a/src/statstruk/ssbmodel.py b/src/statstruk/ssbmodel.py
@@ -102,14 +102,6 @@ def _check_variable(
                     f"Variable '{var_name}' contains missing values. Please fix and try again."
                 )
 
-    def _add_flag(self) -> None:
-        """Add flag in population data to say if unit is in the sample or not."""
-        self.flag_var = "_flag_sample"
-        sample_ids = set(self.sample_data[self.id_nr])
-        self.pop_data[self.flag_var] = self.pop_data[self.id_nr].apply(
-            lambda x: 1 if x in sample_ids else 0
-        )
-
     def _check_model_run(self) -> None:
         """Check to ensure that model has been run before proceeding with other functions."""
         if not hasattr(self, "strata_results"):
diff --git a/tests/test_ratemodel.py b/tests/test_ratemodel.py
@@ -209,7 +209,7 @@ def test_statstruk_ratemodel_1obs(capfd):
     )
 
 
-def test_stastruk_ratemodel_check_neg() -> None:
+def test_statstruk_ratemodel_check_neg() -> None:
     s_data_new = pd.read_csv(sample_file)
     p_data = pd.read_csv(pop_file)
     s_data_new.iloc[0, 1] = -5
@@ -229,7 +229,7 @@ def test_stastruk_ratemodel_check_neg() -> None:
     )
 
 
-def test_stastruk_ratemodel_check_sample0(capfd) -> None:
+def test_statstruk_ratemodel_check_sample0(capfd) -> None:
     s_data_new = pd.read_csv(sample_file)
     s_data_new = s_data_new.loc[s_data_new.job_vacancies.notna(),]
 
@@ -258,7 +258,7 @@ def test_stastruk_ratemodel_check_sample0(capfd) -> None:
     assert np.isnan(mod1.get_obs["B"]["G"])[0]
 
 
-def test_stastruk_ratemodel_check_pop0(capfd) -> None:
+def test_statstruk_ratemodel_check_pop0(capfd) -> None:
     sample1 = pd.read_csv(sample1_file)
     pop1 = pd.read_csv(pop1_file)
 
@@ -272,7 +272,7 @@ def test_stastruk_ratemodel_check_pop0(capfd) -> None:
 
     # check those with x=0 in population are imputed with zero
     imp = mod1.get_imputed()
-    assert imp["job_vacancies_imputed"][0] == 0
+    assert imp["job_vacancies_imp"][0] == 0
 
 
 def test_stastruk_ratemodel_check_allbutone0(capfd) -> None:
@@ -386,7 +386,7 @@ def test_stastruk_ratemodel_check_all0(capfd) -> None:
     assert all(np.isnan(mod1.get_obs["G"]["G"]))
 
 
-def test_stastruk_ratemodel_negative_variance(capfd) -> None:
+def test_stastruk_ratemodel_negative_variance() -> None:
     sample1 = pd.read_csv(sample_file)
     sample1 = sample1.loc[sample1.job_vacancies.notna(),]
     pop1 = pd.read_csv(pop_file)
@@ -398,7 +398,7 @@ def test_stastruk_ratemodel_negative_variance(capfd) -> None:
     sample1 = sample1.loc[(sample1.industry != "F") | (sample1.id.isin(sample_ids))]
     pop1 = pop1.loc[(pop1.industry != "F") | (pop1.id.isin(pop_ids))]
 
-    mod1 = ratemodel(pop1, sample1, id_nr="id")
+    mod1 = ratemodel(pop1, sample1, id_nr="id", verbose=2)
     mod1.fit(
         x_var="employees",
         y_var="job_vacancies",
@@ -407,11 +407,5 @@ def test_stastruk_ratemodel_negative_variance(capfd) -> None:
     )
     res = mod1.get_estimates()
 
-    # Use capfd to capture the print output
-    out, err = capfd.readouterr()
-
-    # Assert that the captured output matches the expected message
-    assert out == "Negative variances calculated. These are being adjusted to 0.\n"
-
     # Check that obs are given na
     assert res["job_vacancies_CV2"][4] == 0