Skip to content

Commit 7001695

Browse files
authored
Merge pull request #214 from DataExpert-io/feature/analytical-patterns
adding analytical patterns to handbook
2 parents b3ce15b + f284b23 commit 7001695

File tree

8 files changed

+227
-0
lines changed

8 files changed

+227
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# 4 Applying Analytics Patterns and Advanced SQL
2+
3+
We are going to be talking about a few different analytical patterns:
4+
5+
- State Change Tracking
6+
- Smoothing of trend lines
7+
- J curve analysis
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Week 4 Applying Analytical Patterns
2+
The homework this week uses the `players`, `players_scd`, and `player_seasons` tables from week 1, as well as the `game_details` table.
3+
4+
- A query that does state change tracking for `players`
5+
- A player entering the league should be `New`
6+
- A player leaving the league should be `Retired`
7+
- A player staying in the league should be `Continued Playing`
8+
- A player that comes out of retirement should be `Returned from Retirement`
9+
- A player that stays out of the league should be `Stayed Retired`
10+
11+
- A query that uses `GROUPING SETS` to do efficient aggregations of `game_details` data
12+
- Aggregate this dataset along the following dimensions
13+
- player and team
14+
- Answer questions like who scored the most points playing for one team?
15+
- player and season
16+
- Answer questions like who scored the most points in one season?
17+
- team
18+
- Answer questions like which team has won the most games?
19+
20+
- A query that uses window functions on `game_details` to find out the following things:
21+
- What is the highest number of games a team has won in any 90-game stretch?
22+
- What is the longest streak of consecutive games in which LeBron James scored over 10 points?
23+
24+
25+
Please add these queries to a folder named `homework/<discord-username>`.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
-- Funnel analysis: for every URL, what share of hits is followed later on the
-- same day, by the same user, by a hit on '/api/v1/user'?
WITH deduped_events AS (
    -- Collapse exact duplicates of (url, host, user_id, event_time).
    SELECT DISTINCT
        url,
        host,
        user_id,
        event_time
    FROM events
),
clean_events AS (
    -- Keep identified users only and derive the calendar day of each event.
    SELECT
        url,
        host,
        user_id,
        event_time,
        DATE(event_time) AS event_date
    FROM deduped_events
    WHERE user_id IS NOT NULL
),
converted AS (
    -- One row per (user, event_time, url) hit; `converted` is 1 when at least
    -- one later same-day event of that user is on '/api/v1/user', else 0.
    -- NOTE(review): this is an inner join, so hits with no later same-day
    -- event never reach the final rates - confirm that is intended.
    SELECT
        hit.user_id,
        hit.event_time,
        hit.url,
        COUNT(DISTINCT follow_up.url)
            FILTER (WHERE follow_up.url = '/api/v1/user') AS converted
    FROM clean_events AS hit
    INNER JOIN clean_events AS follow_up
        ON follow_up.user_id = hit.user_id
        AND follow_up.event_date = hit.event_date
        AND follow_up.event_time > hit.event_time
    GROUP BY
        hit.user_id,
        hit.event_time,
        hit.url
)

-- Per-URL hit count and conversion rate, limited to URLs with more than 100
-- hits and at least one conversion.
SELECT
    url,
    COUNT(*),
    CAST(SUM(converted) AS REAL) / COUNT(*)
FROM converted
GROUP BY url
HAVING COUNT(*) > 100
    AND SUM(converted) > 0
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
-- Builds a dashboard table of hit counts at four aggregation levels:
-- the full (os_type, device_type, browser_type) combination, and each of the
-- three attributes on its own.
CREATE TABLE device_hits_dashboard AS
WITH events_augmented AS (
    -- Attach device attributes to every event; NULL attributes become 'unknown'.
    SELECT
        COALESCE(d.os_type, 'unknown') AS os_type,
        COALESCE(d.device_type, 'unknown') AS device_type,
        COALESCE(d.browser_type, 'unknown') AS browser_type,
        url,
        user_id
    FROM events AS e
    INNER JOIN devices AS d
        ON d.device_id = e.device_id
)

SELECT
    -- GROUPING(...) over several columns returns a bitmask that is 0 only when
    -- every listed column is part of the current grouping set.
    CASE
        WHEN GROUPING(os_type, device_type, browser_type) = 0
            THEN 'os_type__device_type__browser'
        WHEN GROUPING(browser_type) = 0
            THEN 'browser_type'
        WHEN GROUPING(device_type) = 0
            THEN 'device_type'
        WHEN GROUPING(os_type) = 0
            THEN 'os_type'
    END AS aggregation_level,
    -- Columns rolled up by the current grouping set arrive as NULL.
    COALESCE(os_type, '(overall)') AS os_type,
    COALESCE(device_type, '(overall)') AS device_type,
    COALESCE(browser_type, '(overall)') AS browser_type,
    COUNT(*) AS number_of_hits
FROM events_augmented
GROUP BY GROUPING SETS (
    (browser_type, device_type, os_type),
    (browser_type),
    (os_type),
    (device_type)
)
ORDER BY number_of_hits DESC
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
-- One daily step of growth accounting: combines the 2023-03-09 snapshot of
-- users_growth_accounting with the users seen in `events` on 2023-03-10 and
-- emits one row per user for the new day.
WITH yesterday AS (
    -- Previous day's snapshot (only the columns used below).
    SELECT
        uga.user_id,
        uga.first_active_date,
        uga.last_active_date,
        uga.dates_active,
        uga.date
    FROM users_growth_accounting AS uga
    WHERE uga.date = DATE('2023-03-09')
),
today AS (
    -- Identified users with at least one event on the current day.
    -- NOTE(review): DATE_TRUNC yields a TIMESTAMP, so today_date is compared
    -- against DATE columns via implicit coercion - confirm that is intended.
    SELECT
        CAST(user_id AS TEXT) AS user_id,
        DATE_TRUNC('day', CAST(event_time AS TIMESTAMP)) AS today_date,
        COUNT(*)
    FROM events
    WHERE user_id IS NOT NULL
        AND DATE_TRUNC('day', CAST(event_time AS TIMESTAMP)) = DATE('2023-03-10')
    GROUP BY
        user_id,
        DATE_TRUNC('day', CAST(event_time AS TIMESTAMP))
)

SELECT
    COALESCE(cur.user_id, prev.user_id) AS user_id,
    -- Keep the earliest known activity date; brand-new users start today.
    COALESCE(prev.first_active_date, cur.today_date) AS first_active_date,
    -- Advance to today when active, otherwise carry the old value forward.
    COALESCE(cur.today_date, prev.last_active_date) AS last_active_date,
    -- Daily state; branches are evaluated in order, first match wins.
    CASE
        WHEN prev.user_id IS NULL
            THEN 'New'
        WHEN prev.last_active_date = cur.today_date - INTERVAL '1 day'
            THEN 'Retained'
        WHEN prev.last_active_date < cur.today_date - INTERVAL '1 day'
            THEN 'Resurrected'
        WHEN cur.today_date IS NULL AND prev.last_active_date = prev.date
            THEN 'Churned'
        ELSE 'Stale'
    END AS daily_active_state,
    -- Weekly state; same idea with 7-day thresholds.
    CASE
        WHEN prev.user_id IS NULL
            THEN 'New'
        WHEN prev.last_active_date < cur.today_date - INTERVAL '7 day'
            THEN 'Resurrected'
        WHEN cur.today_date IS NULL
            AND prev.last_active_date = prev.date - INTERVAL '7 day'
            THEN 'Churned'
        WHEN COALESCE(cur.today_date, prev.last_active_date) + INTERVAL '7 day' >= prev.date
            THEN 'Retained'
        ELSE 'Stale'
    END AS weekly_active_state,
    -- Append today to the activity history only when the user was active.
    COALESCE(prev.dates_active, ARRAY[]::DATE[])
        || CASE
            WHEN cur.user_id IS NOT NULL
                THEN ARRAY[cur.today_date]
            ELSE ARRAY[]::DATE[]
        END AS date_list,
    -- Snapshot date of the emitted row.
    COALESCE(cur.today_date, prev.date + INTERVAL '1 day') AS date
FROM yesterday AS prev
FULL OUTER JOIN today AS cur
    ON cur.user_id = prev.user_id
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- Retention curve: for each "days since first active" offset, the share of
-- user-day rows whose daily state counts as active, plus the cohort size.
SELECT
    date - first_active_date AS days_since_first_active,
    CAST(
        COUNT(*) FILTER (
            WHERE daily_active_state IN ('Retained', 'Resurrected', 'New')
        ) AS REAL
    ) / COUNT(*) AS pct_active,
    COUNT(*)
FROM users_growth_accounting
GROUP BY date - first_active_date;
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
-- Daily traffic per (referrer, url) with rolling-week comparisons and each
-- day's share of its month and of the pair's all-time total.
WITH events_augmented AS (
    -- Normalise raw referrer strings into a handful of source labels.
    -- NOTE(review): no device column is used downstream, so this inner join
    -- only restricts (or, if device_id is not unique, duplicates) event rows -
    -- confirm it is intended.
    SELECT
        url,
        -- Branches are evaluated in order, first match wins.
        -- NOTE(review): '%t.co%' is a substring match, so it also matches
        -- referrers such as 'reddit.com' - consider a tighter pattern.
        CASE
            WHEN referrer LIKE '%linkedin%' THEN 'Linkedin'
            WHEN referrer LIKE '%t.co%' THEN 'Twitter'
            WHEN referrer LIKE '%google%' THEN 'Google'
            WHEN referrer LIKE '%lnkd%' THEN 'Linkedin'
            WHEN referrer LIKE '%eczachly%' THEN 'On Site'
            WHEN referrer LIKE '%zachwilson%' THEN 'On Site'
            ELSE referrer
        END AS referrer,
        DATE(event_time) AS event_date
    FROM events AS e
    INNER JOIN devices AS d
        ON e.device_id = d.device_id
),
aggregated AS (
    -- One row per (url, referrer, day) that has at least one hit.
    SELECT
        url,
        referrer,
        event_date,
        COUNT(*) AS count
    FROM events_augmented
    GROUP BY
        url,
        referrer,
        event_date
),
windowed AS (
    SELECT
        referrer,
        url,
        event_date,
        count,
        -- Total hits of the pair within the calendar month of this row.
        SUM(count) OVER (
            PARTITION BY referrer, url, DATE_TRUNC('month', event_date)
        ) AS monthly_total,
        -- Total hits of the pair across all days.
        SUM(count) OVER (
            PARTITION BY referrer, url
        ) AS overall_total,
        -- Current row plus the 6 rows before it. Frames count ROWS, not
        -- calendar days: days without hits have no row in `aggregated`.
        SUM(count) OVER (
            PARTITION BY referrer, url
            ORDER BY event_date
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) AS weekly_rolling_count,
        -- The 7 rows immediately before the current rolling week. The frame
        -- must end at 7 PRECEDING: ending at 6 PRECEDING would overlap the
        -- current week and span 8 rows instead of 7.
        SUM(count) OVER (
            PARTITION BY referrer, url
            ORDER BY event_date
            ROWS BETWEEN 13 PRECEDING AND 7 PRECEDING
        ) AS previous_weekly_rolling_count
    FROM aggregated
)

-- Only pairs with more than 500 total hits and a known referrer are reported.
SELECT
    referrer,
    url,
    event_date,
    count,
    weekly_rolling_count,
    previous_weekly_rolling_count,
    CAST(count AS REAL) / monthly_total AS pct_of_month,
    CAST(count AS REAL) / overall_total AS pct_of_total
FROM windowed
WHERE overall_total > 500
    AND referrer IS NOT NULL
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Growth-accounting snapshot table: one row per user per snapshot date.
CREATE TABLE users_growth_accounting (
    user_id             TEXT,    -- user identifier, stored as text
    first_active_date   DATE,    -- first day the user was seen active
    last_active_date    DATE,    -- most recent day the user was seen active
    daily_active_state  TEXT,    -- day-over-day state label
    weekly_active_state TEXT,    -- week-over-week state label
    dates_active        DATE[],  -- history of active days
    date                DATE,    -- snapshot date of this row
    -- Same name PostgreSQL generates by default for this table's primary key.
    CONSTRAINT users_growth_accounting_pkey PRIMARY KEY (user_id, date)
);

0 commit comments

Comments
 (0)