diff --git a/.gitignore b/.gitignore index f5378e980..b3f916c9e 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,5 @@ coverage.xml # pixi environments .pixi/* !.pixi/config.toml +SKILL.md +CLAUDE.md diff --git a/docs/_quarto.yml b/docs/_quarto.yml index 25dc32b2b..daa8faf4d 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -37,36 +37,36 @@ website: file: pyfixest-sprint.md - text: Learn more menu: - - text: "Regression Tables and Summary Statistics" - file: table-layout.qmd - - text: "Hypothesis Testing and Marginal Effects" - file: marginaleffects.qmd - - text: "Difference-in-Differences Estimation" - file: difference-in-differences.qmd - - file: multiple_testing.ipynb - text: "Multiple Testing Corrections" - - file: regression_decomposition.ipynb - text: "Regression Decomposition" - - file: ssc.qmd - text: "On Small Sample Corrections" - - file: quantile-regression.qmd - text: "Quantile Regression" - #- text: "Compare fixest & PyFixest" - # file: compare-fixest-pyfixest.qmd - - text: "Compare Stata & PyFixest" - file: stata-2-pyfixest.qmd - - text: "PyFixest on the GPU via CuPy" - file: pyfixest-gpu-cupy.ipynb - - text: "PyFixest on the GPU via JAX" - file: pyfixest_gpu.ipynb - - text: "Other Resources around PyFixest" - file: resources.qmd - - text: "Replicating 'The Effect' with PyFixest" - file: replicating-the-effect.qmd - - text: "Replicating 'The Mixtape' with PyFixest" - file: mixtape.ipynb - - text: "Replicating 'Causal Inference for the Brave and True' with PyFixest" - file: brave_true.ipynb + - text: "Regression Tables and Summary Statistics" + file: table-layout.qmd + - text: "Hypothesis Testing and Marginal Effects" + file: marginaleffects.qmd + - text: "Difference-in-Differences Estimation" + file: difference-in-differences.qmd + - file: multiple_testing.ipynb + text: "Multiple Testing Corrections" + - file: regression_decomposition.ipynb + text: "Regression Decomposition" + - file: ssc.qmd + text: "On Small Sample Corrections" + - file: quantile-regression.qmd + text: "Quantile Regression" + #- text: "Compare fixest & PyFixest" + # file: compare-fixest-pyfixest.qmd + - text: "Compare Stata & PyFixest" + file: stata-2-pyfixest.qmd + - text: "PyFixest on the GPU via CuPy" + file: pyfixest-gpu-cupy.ipynb + - text: "PyFixest on the GPU via JAX" + file: pyfixest_gpu.ipynb + - text: "Other Resources around PyFixest" + file: resources.qmd + - text: "Replicating 'The Effect' with PyFixest" + file: replicating-the-effect.qmd + - text: "Replicating 'The Mixtape' with PyFixest" + file: mixtape.ipynb + - text: "Replicating 'Causal Inference for the Brave and True' with PyFixest" + file: brave_true.ipynb quartodoc: package: pyfixest @@ -116,6 +116,13 @@ quartodoc: - report.coefplot - report.iplot - did.visualize.panelview + - title: Formula Parsing & Model Matrix + desc: | + Internal APIs for formula parsing and model matrix construction + contents: + - estimation.formula.parse.Formula + - estimation.formula.model_matrix.ModelMatrix + - estimation.formula.factor_interaction.factor_interaction - title: Misc / Utilities desc: | PyFixest internals and utilities diff --git a/docs/_sidebar.yml b/docs/_sidebar.yml index 0c696cfd1..8bc2f3b1d 100644 --- a/docs/_sidebar.yml +++ b/docs/_sidebar.yml @@ -32,6 +32,12 @@ website: - reference/report.iplot.qmd - reference/did.visualize.panelview.qmd section: Summarize and Visualize + - contents: + - reference/estimation.formula.parse.Formula.qmd + - reference/estimation.formula.parse.parse.qmd + - reference/estimation.formula.model_matrix.ModelMatrix.qmd + - reference/estimation.formula.factor_interaction.factor_interaction.qmd + section: Formula Parsing & Model Matrix - contents: - reference/estimation.demean.qmd - reference/estimation.detect_singletons.qmd diff --git a/docs/acknowledgements.md b/docs/acknowledgements.md index 2529b6462..dfef5edd1 100644 --- a/docs/acknowledgements.md +++ b/docs/acknowledgements.md @@ -22,7 +22,7 @@ More concretely, we have borrowed the following API conventions and ideas direct | **On the fly variance covariance adjustments** | As in `fixest`, you can adjust the vcov post estimation by calling a `vcov()` method on the results object (`Feols` in pyfixest and `fixest` in `fixest`) | | **Predict method for fixed effects** | The `predict()` and `fixef()` methods in PyFixest mirrors fixest's functionality for obtaining fitted values, fixed effects, and linear predictions | -You can learn more about fixest [on github](https://github.com/lrberge/fixest), via its [documentation](https://lrberge.github.io/fixest/), or by reading the [associated paper](https://arxiv.org/abs/2601.21749). +You can learn more about fixest [on github](https://github.com/lrberge/fixest), via its [documentation](https://lrberge.github.io/fixest/), or by reading the [associated paper](https://arxiv.org/abs/2601.21749). PyFixest is tested against fixest via **rpy2** to ensure numerical equivalence (usually `rtol = 1e-08`, `atol = 1e-08`) for coefficients, diff --git a/docs/quickstart.qmd b/docs/quickstart.qmd index 4fe552932..90d0bdae8 100644 --- a/docs/quickstart.qmd +++ b/docs/quickstart.qmd @@ -507,7 +507,7 @@ multi_fit.etable() You can access an individual model by its name - i.e. a formula - via the `all_fitted_models` attribute. ```{python} -multi_fit.all_fitted_models["Y~X1"].tidy() +multi_fit.all_fitted_models["Y ~ X1"].tidy() ``` or equivalently via the `fetch_model` method: diff --git a/pyfixest/did/did2s.py b/pyfixest/did/did2s.py index af20b83da..4381d6156 100644 --- a/pyfixest/did/did2s.py +++ b/pyfixest/did/did2s.py @@ -8,8 +8,8 @@ from pyfixest.did.did import DID from pyfixest.estimation import feols from pyfixest.estimation.feols_ import Feols -from pyfixest.estimation.FormulaParser import FixestFormulaParser -from pyfixest.estimation.model_matrix_fixest_ import model_matrix_fixest +from pyfixest.estimation.formula import model_matrix +from pyfixest.estimation.formula.parse import Formula class DID2S(DID): @@ -304,37 +304,48 @@ def _did2s_vcov( # some formula parsing to get the correct formula for the first and second stage model matrix first_stage_x, first_stage_fe = first_stage.split("|") - first_stage_fe_list = [f"C({i})" for i in first_stage_fe.split("+")] + first_stage_fe_list = [f"C({i.strip()})" for i in first_stage_fe.split("+")] first_stage_fe_fml = "+".join(first_stage_fe_list) - first_stage = f"{first_stage_x}+{first_stage_fe_fml}" - - second_stage = f"{second_stage}" + first_stage_fml = f"{first_stage_x}+{first_stage_fe_fml}" # note for future Alex: intercept needs to be dropped! it is not as fixed # effects are converted to dummies, hence has_fixed checks are False - FML1 = FixestFormulaParser(f"{yname} {first_stage}") - FML2 = FixestFormulaParser(f"{yname} {second_stage}") - FixestFormulaDict1 = FML1.FixestFormulaDict - FixestFormulaDict2 = FML2.FixestFormulaDict + # Create Formula objects for the new model_matrix system. + # First stage: use `- 1` so that C() dummy encoding keeps all levels, + # matching the feols demeaning approach (which implicitly includes all + # fixed-effect levels). Removing `- 1` would cause formulaic to drop + # reference levels, changing the GMM vcov standard errors. + FML1 = Formula( + _second_stage=f"{yname} ~ {first_stage_fml.replace('~', '').strip()} - 1", + ) + # Second stage: do NOT use `- 1`. Formulaic needs the intercept present + # for full-rank encoding (dropping a reference level for factors like + # i(treat)). The intercept column is then removed by drop_intercept=True + # below, matching what feols does in _did2s_estimate. + FML2 = Formula( + _second_stage=f"{yname} ~ {second_stage.replace('~', '').strip()}", + ) - mm_dict_first_stage = model_matrix_fixest( - FixestFormula=next(iter(FixestFormulaDict1.values()))[0], + mm_first_stage = model_matrix.create_model_matrix( + formula=FML1, data=data, weights=None, drop_singletons=False, - drop_intercept=False, + ensure_full_rank=True, + drop_intercept=True, ) - X1 = cast(pd.DataFrame, mm_dict_first_stage.get("X")) + X1 = mm_first_stage.independent - mm_second_stage = model_matrix_fixest( - FixestFormula=next(iter(FixestFormulaDict2.values()))[0], + mm_second_stage = model_matrix.create_model_matrix( + formula=FML2, data=data, weights=None, drop_singletons=False, + ensure_full_rank=True, drop_intercept=True, - ) # reference values not dropped, multicollinearity error - X2 = cast(pd.DataFrame, mm_second_stage.get("X")) + ) + X2 = mm_second_stage.independent X1 = csr_matrix(X1.to_numpy() * weights_array[:, None]) X2 = csr_matrix(X2.to_numpy() * weights_array[:, None]) @@ -359,10 +370,7 @@ def _did2s_vcov( X10 = X10.tocsr() X2 = X2.tocsr() # type: ignore - for ( - _, - g, - ) in enumerate(clustid): + for _, g in enumerate(clustid): idx_g: np.ndarray = cluster_col.values == g X10g = X10[idx_g, :] X2g = X2[idx_g, :] diff --git a/pyfixest/did/saturated_twfe.py b/pyfixest/did/saturated_twfe.py index d6c5153c8..815072643 100644 --- a/pyfixest/did/saturated_twfe.py +++ b/pyfixest/did/saturated_twfe.py @@ -203,15 +203,14 @@ def aggregate( treated_periods = list(period_set) df_agg = pd.DataFrame( - index=treated_periods, + index=pd.Index(treated_periods, name="period"), columns=["Estimate", "Std. Error", "t value", "Pr(>|t|)", "2.5%", "97.5%"], ) - df_agg.index.name = "period" for period in treated_periods: R = np.zeros(len(coefs)) for cohort in cohort_list: - cohort_pattern = rf"\[{re.escape(str(period))}\]:.*{re.escape(cohort)}$" + cohort_pattern = rf"^(?:.+)::{period}:(?:.+)::{cohort}$" match_idx = [ i for i, name in enumerate(coefnames) @@ -319,28 +318,20 @@ def _saturated_event_study( unit_id: str, cluster: Optional[str] = None, ): - cohort_dummies = pd.get_dummies( - df.first_treated_period, drop_first=True, prefix="cohort_dummy" + ff = f"{outcome} ~ i(rel_time, first_treated_period, ref = -1.0, ref2=0.0) | {unit_id} + {time_id}" + m = feols(fml=ff, data=df, vcov={"CRV1": cluster}) # type: ignore + res = m.tidy().reset_index() + res = res.join( + res["Coefficient"].str.extract( + r".+::(?P