Skip to content

Commit 75dcc37

Browse files
committed
adding fact data modeling content to handbook
1 parent 12f623a commit 75dcc37

16 files changed

+389
-0
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
pip-wheel-metadata/
24+
share/python-wheels/
25+
*.egg-info/
26+
.installed.cfg
27+
*.egg
28+
MANIFEST
29+
30+
# PyInstaller
31+
# Usually these files are written by a python script from a template
32+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
33+
*.manifest
34+
*.spec
35+
36+
# Installer logs
37+
pip-log.txt
38+
pip-delete-this-directory.txt
39+
40+
# Unit test / coverage reports
41+
htmlcov/
42+
.tox/
43+
.nox/
44+
.coverage
45+
.coverage.*
46+
.cache
47+
nosetests.xml
48+
coverage.xml
49+
*.cover
50+
*.py,cover
51+
.hypothesis/
52+
.pytest_cache/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
target/
76+
77+
# Jupyter Notebook
78+
.ipynb_checkpoints
79+
80+
# IPython
81+
profile_default/
82+
ipython_config.py
83+
84+
# pyenv
85+
.python-version
86+
87+
# pipenv
88+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
90+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
91+
# install all needed dependencies.
92+
#Pipfile.lock
93+
94+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
95+
__pypackages__/
96+
97+
# Celery stuff
98+
celerybeat-schedule
99+
celerybeat.pid
100+
101+
# SageMath parsed files
102+
*.sage.py
103+
104+
# Environments
105+
.env
106+
.venv
107+
env/
108+
venv/
109+
ENV/
110+
env.bak/
111+
venv.bak/
112+
113+
# Spyder project settings
114+
.spyderproject
115+
.spyproject
116+
117+
# Rope project settings
118+
.ropeproject
119+
120+
# mkdocs documentation
121+
/site
122+
123+
# mypy
124+
.mypy_cache/
125+
.dmypy.json
126+
dmypy.json
127+
128+
# Pyre type checker
129+
.pyre/
130+
131+
dump.sql
132+
133+
# Personal workspace files
134+
.idea/*
135+
.vscode/*
136+
137+
postgres-data/*
138+
homework/your_username
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Week 2 Fact Data Modeling
2+
3+
This repo follows the same setup as week 1. Please go to the dimensional data modeling [README](../1-dimensional-data-modeling/README.md) for instructions.

bootcamp/materials/2-fact-data-modeling/homework/.gitkeep

Whitespace-only changes.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Week 2 Fact Data Modeling
2+
The homework this week uses the `devices` and `events` datasets.
3+
4+
Construct the following eight queries:
5+
6+
- A query to deduplicate `game_details` from Day 1 so that there are no duplicates
7+
8+
- A DDL for a `user_devices_cumulated` table that has:
9+
- a `device_activity_datelist` which tracks a user's active days by `browser_type`
10+
- data type here should look similar to `MAP<STRING, ARRAY[DATE]>`
11+
- or you could have `browser_type` as a column with multiple rows for each user (either way works, just be consistent!)
12+
13+
- A cumulative query to generate `device_activity_datelist` from `events`
14+
15+
- A `datelist_int` generation query. Convert the `device_activity_datelist` column into a `datelist_int` column
16+
17+
- A DDL for `hosts_cumulated` table
18+
- a `host_activity_datelist` which records the dates on which each host experiences any activity
19+
20+
- The incremental query to generate `host_activity_datelist`
21+
22+
- A monthly, reduced fact table DDL `host_activity_reduced`
23+
- month
24+
- host
25+
- hit_array - think COUNT(1)
26+
- unique_visitors array - think COUNT(DISTINCT user_id)
27+
28+
- An incremental query that loads `host_activity_reduced`
29+
- day-by-day
30+
31+
Please add these queries into a folder, zip them up and submit [here](https://bootcamp.techcreator.io)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
-- Builds a 32-bit activity bitmask per user for the 32-day window
-- 2023-02-28 .. 2023-03-31 (snapshot date 2023-03-31) and derives
-- monthly / weekly activity metrics from it.
--
-- Bit layout of datelist_int: the leftmost bit is the snapshot day itself,
-- and each position to the right is one day further into the past.
WITH starter AS (
    -- One row per (user, day in the window), flagged with whether the user's
    -- dates_active array contains that day.
    SELECT
        uc.dates_active @> ARRAY[DATE(d.valid_date)] AS is_active,
        -- Integer date subtraction gives whole days (0 for the snapshot day,
        -- 31 for 2023-02-28) with no time-of-day or time-zone component.
        DATE('2023-03-31') - DATE(d.valid_date) AS days_since,
        uc.user_id
    FROM users_cumulated AS uc
    CROSS JOIN (
        SELECT generate_series('2023-02-28', '2023-03-31', INTERVAL '1 day') AS valid_date
    ) AS d
    WHERE uc.date = DATE('2023-03-31')
),
bits AS (
    SELECT
        user_id,
        -- days_since runs 0..31, so the exponent must run 31..0 to fit in
        -- BIT(32). With an exponent of 32 - days_since the snapshot day would
        -- be 2^32, which the BIT(32) cast discards (only the rightmost 32 bits
        -- of the integer are kept), and every other day would sit one position
        -- too far left for the 7-day masks below.
        SUM(
            CASE
                WHEN is_active THEN POW(2, 31 - days_since)
                ELSE 0
            END
        )::BIGINT::BIT(32) AS datelist_int
    FROM starter
    GROUP BY user_id
)

SELECT
    user_id,
    datelist_int,
    -- Any set bit in the 32-day window.
    BIT_COUNT(datelist_int) > 0 AS monthly_active,
    -- Number of active days in the 32-day window.
    BIT_COUNT(datelist_int) AS l32,
    -- Leftmost 7 bits: the snapshot day and the 6 days before it.
    BIT_COUNT(datelist_int &
        CAST('11111110000000000000000000000000' AS BIT(32))) > 0 AS weekly_active,
    BIT_COUNT(datelist_int &
        CAST('11111110000000000000000000000000' AS BIT(32))) AS l7,
    -- Bits 8-14 from the left: the 7 days preceding the current week.
    BIT_COUNT(datelist_int &
        CAST('00000001111111000000000000000000' AS BIT(32))) > 0 AS weekly_active_previous_week
FROM bits;
33+
34+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
-- Loads user_datelist_int with one 32-bit activity bitmask per user for the
-- 32-day window 2023-02-28 .. 2023-03-31 (snapshot date 2023-03-31).
--
-- Bit layout of datelist_int: the leftmost bit is the snapshot day itself,
-- and each position to the right is one day further into the past.
WITH starter AS (
    -- One row per (user, day in the window), flagged with whether the user's
    -- dates_active array contains that day.
    SELECT
        uc.dates_active @> ARRAY[DATE(d.valid_date)] AS is_active,
        -- Integer date subtraction gives whole days (0 for the snapshot day,
        -- 31 for 2023-02-28) with no time-of-day or time-zone component.
        DATE('2023-03-31') - DATE(d.valid_date) AS days_since,
        uc.user_id
    FROM users_cumulated AS uc
    CROSS JOIN (
        SELECT generate_series('2023-02-28', '2023-03-31', INTERVAL '1 day') AS valid_date
    ) AS d
    WHERE uc.date = DATE('2023-03-31')
),
bits AS (
    SELECT
        user_id,
        -- days_since runs 0..31, so the exponent must run 31..0 to fit in
        -- BIT(32). An exponent of 32 - days_since would turn the snapshot day
        -- into 2^32, which the BIT(32) cast discards.
        SUM(
            CASE
                WHEN is_active THEN POW(2, 31 - days_since)
                ELSE 0
            END
        )::BIGINT::BIT(32) AS datelist_int,
        DATE('2023-03-31') AS date
    FROM starter
    GROUP BY user_id
)

-- Columns are listed explicitly (same order as the CTE produces them) so the
-- insert does not silently change if the CTE gains or reorders columns.
INSERT INTO user_datelist_int
SELECT
    user_id,
    datelist_int,
    date
FROM bits;
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
-- Incremental load of monthly_user_site_hits for 2023-03-03: takes the
-- 2023-03-02 partition, appends each user's hit count for 2023-03-03 to
-- hit_array, and writes the result as the 2023-03-03 partition.
WITH yesterday AS (
    -- Previous day's cumulative state; only the columns used below.
    SELECT
        user_id,
        hit_array,
        first_found_date
    FROM monthly_user_site_hits
    WHERE date_partition = '2023-03-02'
),
today AS (
    -- Hit count per user for the day being loaded.
    SELECT
        user_id,
        DATE_TRUNC('day', event_time) AS today_date,
        COUNT(1) AS num_hits
    FROM events
    WHERE DATE_TRUNC('day', event_time) = DATE('2023-03-03')
      AND user_id IS NOT NULL
    GROUP BY user_id, DATE_TRUNC('day', event_time)
)
INSERT INTO monthly_user_site_hits (
    user_id,
    hit_array,
    month_start,
    first_found_date,
    date_partition
)
SELECT
    COALESCE(y.user_id, t.user_id) AS user_id,
    -- Users first seen today get NULL padding for the days of the month that
    -- have already passed (2023-03-03 - 2023-03-01 = 2 slots) before today's
    -- count is appended. Users with no hits today get a NULL slot appended.
    COALESCE(
        y.hit_array,
        array_fill(NULL::BIGINT, ARRAY[DATE('2023-03-03') - DATE('2023-03-01')])
    ) || ARRAY[t.num_hits] AS hit_array,
    DATE('2023-03-01') AS month_start,
    -- Keep yesterday's first_found_date when the user has no events today.
    -- Without the IS NULL check, the comparison against a NULL today_date is
    -- unknown and the CASE falls through to ELSE, overwriting the stored date
    -- with NULL.
    CASE
        WHEN t.today_date IS NULL OR y.first_found_date < t.today_date
            THEN y.first_found_date
        ELSE t.today_date
    END AS first_found_date,
    DATE('2023-03-03') AS date_partition
FROM yesterday AS y
FULL OUTER JOIN today AS t
    ON y.user_id = t.user_id;
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- Sums the first two slots of hit_array across all rows of the 2023-03-03
-- partition, grouped by month. Per the column aliases, slots 1 and 2 hold the
-- hit counts for March 1 and March 2.
SELECT
    month_start,
    SUM(hit_array[1]) AS num_hits_mar_1,
    SUM(hit_array[2]) AS num_hits_mar_2
FROM monthly_user_site_hits
WHERE date_partition = DATE('2023-03-03')
GROUP BY month_start
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
-- Incremental load of users_cumulated for 2023-03-31: takes the 2023-03-30
-- snapshot, appends 2023-03-31 to dates_active for every user with at least
-- one event that day, and writes the result as the 2023-03-31 snapshot.
WITH yesterday AS (
    -- Previous day's cumulative state; only the columns used below.
    SELECT
        user_id,
        dates_active,
        date
    FROM users_cumulated
    WHERE date = DATE('2023-03-30')
),
today AS (
    -- One row per user that has events on the day being loaded.
    SELECT
        user_id,
        -- Cast to DATE so the value matches the DATE[] arrays it is
        -- concatenated with below, instead of relying on implicit
        -- timestamp/date array coercion.
        CAST(DATE_TRUNC('day', event_time) AS DATE) AS today_date,
        COUNT(1) AS num_events
    FROM events
    WHERE DATE_TRUNC('day', event_time) = DATE('2023-03-31')
      AND user_id IS NOT NULL
    GROUP BY user_id, DATE_TRUNC('day', event_time)
)
-- Explicit target columns so the load does not depend on the table's
-- physical column order.
INSERT INTO users_cumulated (
    user_id,
    dates_active,
    date
)
SELECT
    COALESCE(t.user_id, y.user_id) AS user_id,
    -- New users start from an empty list; users with no events today keep
    -- their list unchanged.
    COALESCE(y.dates_active, ARRAY[]::DATE[])
        || CASE
               WHEN t.user_id IS NOT NULL THEN ARRAY[t.today_date]
               ELSE ARRAY[]::DATE[]
           END AS dates_active,
    -- Users without events today are carried forward one day from yesterday.
    COALESCE(t.today_date, CAST(y.date + INTERVAL '1 day' AS DATE)) AS date
FROM yesterday AS y
FULL OUTER JOIN today AS t
    ON t.user_id = y.user_id;
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
-- Device dimension: one set of browser, device and OS attributes per
-- device_id, with each version split into major / minor / patch parts.
CREATE TABLE devices (
    device_id              BIGINT,

    -- Browser
    browser_type           TEXT,
    browser_version_major  BIGINT,
    browser_version_minor  BIGINT,
    browser_version_patch  BIGINT,

    -- Device
    device_type            TEXT,
    device_version_major   BIGINT,
    device_version_minor   BIGINT,
    device_version_patch   BIGINT,

    -- Operating system
    os_type                TEXT,
    os_version_major       BIGINT,
    os_version_minor       BIGINT,
    os_version_patch       BIGINT
)

0 commit comments

Comments
 (0)