Skip to content

Commit 5e48887

Browse files
authored
Merge pull request #18 from statisticsnorway/dev_docfix
Dev docfix
2 parents 6a5ede1 + 5d9b5f2 commit 5e48887

File tree

6 files changed

+613
-15
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ssb-statstruk"
3-
version = "0.0.3"
3+
version = "0.0.4"
44
description = "SSB Statstruk"
55
authors = ["Susie Jentoft <coo@ssb.no>"]
66
license = "MIT"

src/run.py

Lines changed: 342 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
# pop_df.head()
4848
# -
4949

50-
sample_df.head()
50+
sample_df.tail(10)
5151

5252

5353
# ## Standard run of rate model
@@ -146,3 +146,344 @@
146146

147147
pop_df.iloc[0, 0] = np.nan
148148
mod2 = ratemodel(pop_df, sample_df, id_nr="id")
149+
150+
151+
# ## Tests with data with 1 observation
152+
153+
# +
154+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
155+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
156+
157+
pop1["country"] = 1
158+
sample1["country"] = 1
159+
# -
160+
161+
sample1.tail(10)
162+
163+
mod1 = ratemodel(pop1, sample1, id_nr="id")
164+
mod1.fit(
165+
x_var="employees",
166+
y_var="job_vacancies",
167+
strata_var="industry",
168+
control_extremes=True,
169+
)
170+
171+
mod1 = ratemodel(pop1, sample1, id_nr="id")
172+
mod1.fit(
173+
x_var="employees",
174+
y_var="job_vacancies",
175+
strata_var="industry",
176+
exclude=[5, 9],
177+
control_extremes=False,
178+
)
179+
180+
mod1 = ratemodel(pop1, sample1, id_nr="id")
181+
mod1.fit(x_var="employees", y_var="job_vacancies", strata_var="industry")
182+
mod1.fit(
183+
x_var="employees",
184+
y_var="job_vacancies",
185+
strata_var="industry",
186+
exclude=[9855, 9912],
187+
control_extremes=False,
188+
)
189+
190+
191+
# ## Test when all x = 0 in one stratum
192+
193+
# +
194+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
195+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
196+
197+
new_rows = pd.DataFrame(
198+
{
199+
"id": [10002, 10003],
200+
"employees": [50, 50],
201+
"employees_f": [20, 25],
202+
"employees_m": [30, 25],
203+
"turnover": [0, 0],
204+
"size": ["mid", "mid"],
205+
"industry": ["G", "G"],
206+
}
207+
)
208+
pop1 = pd.concat([pop1, new_rows], ignore_index=True)
209+
210+
new_rows2 = pd.DataFrame(
211+
{
212+
"id": [10002, 10003],
213+
"employees": [50, 50],
214+
"employees_f": [20, 25],
215+
"employees_m": [30, 25],
216+
"turnover": [0, 0],
217+
"size": ["mid", "mid"],
218+
"industry": ["G", "G"],
219+
"job_vacancies": [35, 45],
220+
"sick_days": [70, 65],
221+
"sick_days_f": [35, 25],
222+
"sick_days_m": [35, 40],
223+
}
224+
)
225+
sample1 = pd.concat([sample1, new_rows2], ignore_index=True)
226+
sample1.loc[sample1.id == 10001, "turnover"] = 0
227+
# -
228+
229+
mod1 = ratemodel(pop1, sample1, id_nr="id")
230+
mod1.fit(
231+
x_var="turnover",
232+
y_var="job_vacancies",
233+
strata_var="industry",
234+
control_extremes=True,
235+
)
236+
mod1.get_obs["G"]
237+
238+
# ## Test when one x=0
239+
240+
# +
241+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
242+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
243+
244+
new_rows = pd.DataFrame(
245+
{
246+
"id": [10005, 10006],
247+
"employees": [50, 50],
248+
"employees_f": [20, 25],
249+
"employees_m": [30, 25],
250+
"turnover": [1000, 0],
251+
"size": ["mid", "mid"],
252+
"industry": ["G", "G"],
253+
}
254+
)
255+
pop1 = pd.concat([pop1, new_rows], ignore_index=True)
256+
257+
new_rows2 = pd.DataFrame(
258+
{
259+
"id": [10005, 10006],
260+
"employees": [50, 50],
261+
"employees_f": [20, 25],
262+
"employees_m": [30, 25],
263+
"turnover": [1000, 0],
264+
"size": ["mid", "mid"],
265+
"industry": ["G", "G"],
266+
"job_vacancies": [35, 45],
267+
"sick_days": [70, 65],
268+
"sick_days_f": [35, 25],
269+
"sick_days_m": [35, 40],
270+
}
271+
)
272+
sample1 = pd.concat([sample1, new_rows2], ignore_index=True)
273+
274+
# -
275+
276+
mod1 = ratemodel(pop1, sample1, id_nr="id")
277+
mod1.fit(
278+
x_var="turnover",
279+
y_var="job_vacancies",
280+
strata_var="industry",
281+
control_extremes=True,
282+
)
283+
mod1.get_obs["G"]
284+
285+
# ## Test when one x=0 in the population
286+
287+
# +
288+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
289+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
290+
291+
292+
# -
293+
294+
pop1.head(10)
295+
296+
mod1 = ratemodel(pop1, sample1, id_nr="id")
297+
mod1.fit(
298+
x_var="employees",
299+
y_var="job_vacancies",
300+
strata_var="industry",
301+
control_extremes=False,
302+
)
303+
imp = mod1.get_imputed()
304+
imp["job_vacancies_imputed"][0] == 0
305+
306+
307+
mod1.get_extremes(threshold_type="rstud", rbound=15)
308+
309+
# ## Test when one x is negative
310+
311+
# +
312+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
313+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
314+
315+
sample1.loc[sample1.id == 5, "turnover"] = -10
316+
pop1.loc[pop1.id == 5, "turnover"] = -10
317+
# -
318+
319+
mod1 = ratemodel(pop1, sample1, id_nr="id")
320+
mod1.fit(
321+
x_var="turnover",
322+
y_var="job_vacancies",
323+
strata_var="industry",
324+
control_extremes=True,
325+
)
326+
327+
# ## Test when there is one y=0
328+
329+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
330+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
331+
sample1.iloc[0, 7] = 0
332+
333+
mod1 = ratemodel(pop1, sample1, id_nr="id")
334+
mod1.fit(
335+
x_var="employees",
336+
y_var="job_vacancies",
337+
strata_var="industry",
338+
control_extremes=True,
339+
)
340+
341+
342+
# ## Test when all y = 0 in one stratum
343+
344+
# +
345+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
346+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
347+
348+
new_rows = pd.DataFrame(
349+
{
350+
"id": [10006, 10007],
351+
"employees": [50, 50],
352+
"employees_f": [20, 25],
353+
"employees_m": [30, 25],
354+
"turnover": [15000, 15000],
355+
"size": ["mid", "mid"],
356+
"industry": ["G", "G"],
357+
}
358+
)
359+
pop1 = pd.concat([pop1, new_rows], ignore_index=True)
360+
361+
new_rows2 = pd.DataFrame(
362+
{
363+
"id": [10006, 10007],
364+
"employees": [50, 50],
365+
"employees_f": [20, 25],
366+
"employees_m": [30, 25],
367+
"turnover": [15000, 15000],
368+
"size": ["mid", "mid"],
369+
"industry": ["G", "G"],
370+
"job_vacancies": [0, 0],
371+
"sick_days": [70, 65],
372+
"sick_days_f": [35, 25],
373+
"sick_days_m": [35, 40],
374+
}
375+
)
376+
sample1 = pd.concat([sample1, new_rows2], ignore_index=True)
377+
sample1.loc[sample1.id == 10001, "job_vacancies"] = 0
378+
sample1 = sample1.loc[sample1.industry == "G"]
379+
pop1 = pop1.loc[pop1.industry == "G"]
380+
# -
381+
382+
mod1 = ratemodel(pop1, sample1, id_nr="id")
383+
mod1.fit(
384+
x_var="employees",
385+
y_var="job_vacancies",
386+
strata_var="industry",
387+
control_extremes=True,
388+
)
389+
390+
391+
mod1.get_estimates()
392+
393+
mod1.get_obs
394+
395+
# ## Test when y = 0 for all but one observation in a stratum
396+
397+
# +
398+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
399+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
400+
401+
new_rows = pd.DataFrame(
402+
{
403+
"id": [10006, 10007],
404+
"employees": [50, 50],
405+
"employees_f": [20, 25],
406+
"employees_m": [30, 25],
407+
"turnover": [15000, 15000],
408+
"size": ["mid", "mid"],
409+
"industry": ["G", "G"],
410+
}
411+
)
412+
pop1 = pd.concat([pop1, new_rows], ignore_index=True)
413+
414+
new_rows2 = pd.DataFrame(
415+
{
416+
"id": [10006, 10007],
417+
"employees": [50, 50],
418+
"employees_f": [20, 25],
419+
"employees_m": [30, 25],
420+
"turnover": [15000, 15000],
421+
"size": ["mid", "mid"],
422+
"industry": ["G", "G"],
423+
"job_vacancies": [0, 0],
424+
"sick_days": [70, 65],
425+
"sick_days_f": [35, 25],
426+
"sick_days_m": [35, 40],
427+
}
428+
)
429+
sample1 = pd.concat([sample1, new_rows2], ignore_index=True)
430+
431+
# -
432+
433+
mod1 = ratemodel(pop1, sample1, id_nr="id")
434+
mod1.fit(
435+
x_var="employees",
436+
y_var="job_vacancies",
437+
strata_var="industry",
438+
control_extremes=True,
439+
)
440+
441+
mod1.get_obs["G"]
442+
443+
444+
# ## Test with only 2 observations in one stratum
445+
446+
# +
447+
pop1 = pd.read_csv("../tests/data/pop_data_1obs.csv")
448+
sample1 = pd.read_csv("../tests/data/sample_data_1obs.csv")
449+
450+
new_rows = pd.DataFrame(
451+
{
452+
"id": [10006],
453+
"employees": [45],
454+
"employees_f": [20],
455+
"employees_m": [30],
456+
"turnover": [15000],
457+
"size": ["mid"],
458+
"industry": ["G"],
459+
}
460+
)
461+
pop1 = pd.concat([pop1, new_rows], ignore_index=True)
462+
463+
new_rows2 = pd.DataFrame(
464+
{
465+
"id": [10006],
466+
"employees": [45],
467+
"employees_f": [20],
468+
"employees_m": [30],
469+
"turnover": [15000],
470+
"size": ["mid"],
471+
"industry": ["G"],
472+
"job_vacancies": [0],
473+
"sick_days": [70],
474+
"sick_days_f": [35],
475+
"sick_days_m": [35],
476+
}
477+
)
478+
sample1 = pd.concat([sample1, new_rows2], ignore_index=True)
479+
480+
mod1 = ratemodel(pop1, sample1, id_nr="id")
481+
mod1.fit(
482+
x_var="employees",
483+
y_var="job_vacancies",
484+
strata_var="industry",
485+
control_extremes=True,
486+
)
487+
# -
488+
489+
mod1.get_obs["G"]

0 commit comments

Comments
 (0)