Skip to content

Commit a11e404

Browse files
committed
tech detections per origins
1 parent d0c9143 commit a11e404

File tree

1 file changed

+88
-71
lines changed

1 file changed

+88
-71
lines changed
Lines changed: 88 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,144 @@
11
const pastMonth = constants.fnPastMonth(constants.currentMonth)
22

33
publish('tech_detections', {
4-
type: 'table',
5-
description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/page/p_an38lbzywc/edit',
4+
description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/page/p_an38lbzywc/edit',
65
schema: 'wappalyzer',
6+
type: 'incremental',
7+
protected: true,
8+
bigquery: {
9+
partitionBy: 'date',
10+
},
711
tags: ['crawl_complete']
8-
}).query(ctx => `
12+
}).preOps(ctx => `
13+
DELETE FROM ${ctx.self()}
14+
WHERE date = '${constants.currentMonth}';
15+
`).query(ctx => `
916
WITH source AS (
1017
SELECT DISTINCT
1118
date,
12-
root_page AS page,
19+
root_page AS origin,
1320
tech.technology
1421
FROM ${ctx.ref('crawl', 'pages')},
1522
UNNEST(technologies) AS tech
16-
WHERE date >= "${pastMonth}" ${constants.devRankFilter}
23+
WHERE date IN ('${pastMonth}', '${constants.currentMonth}') ${constants.devRankFilter}
1724
),
18-
-- Technology in the previous month (August)
25+
-- Technology in the previous month
1926
tech_before AS (
2027
SELECT
21-
page,
28+
origin,
2229
technology
2330
FROM source
24-
WHERE date = "${pastMonth}"
31+
WHERE date = '${pastMonth}'
2532
),
26-
-- Technology in the current month (September)
33+
-- Technology in the current month
2734
tech_current AS (
2835
SELECT
29-
page,
36+
origin,
3037
technology
3138
FROM source
32-
WHERE date = "${constants.currentMonth}"
39+
WHERE date = '${constants.currentMonth}'
3340
),
34-
-- Summary of technology and categories per page in the previous month
41+
-- Summary of technology per origin in the previous month
3542
tech_before_summary AS (
3643
SELECT
3744
technology,
38-
COUNT(DISTINCT page) AS total_pages_before
45+
COUNT(DISTINCT origin) AS total_origins_before
3946
FROM tech_before
4047
GROUP BY technology
4148
),
42-
-- Pages that existed last month but introduced the technology in the current month
43-
tech_introduced_existing_pages AS (
49+
-- Origins that persisted across both months and adopted the technology in the current month
50+
tech_adopted_existing_origins AS (
4451
SELECT
45-
tech_current.technology,
46-
COUNT(DISTINCT tech_current.page) AS total_pages_introduced_existing,
47-
STRING_AGG(DISTINCT tech_current.page LIMIT 5) AS sample_pages_introduced_existing
48-
FROM tech_current
49-
JOIN tech_before
50-
USING (page)
52+
persisted_origins.technology,
53+
COUNT(DISTINCT persisted_origins.origin) AS total_origins_adopted_existing,
54+
STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_adopted_existing
55+
FROM (
56+
SELECT DISTINCT
57+
tech_current.technology,
58+
tech_current.origin
59+
FROM tech_before
60+
JOIN tech_current
61+
USING (origin)
62+
) as persisted_origins
5163
LEFT JOIN tech_before AS tb
52-
ON tech_current.page = tb.page AND tech_current.technology = tb.technology
53-
WHERE tb.page IS NULL -- Technology was not detected last month
54-
GROUP BY tech_current.technology
64+
ON persisted_origins.origin = tb.origin AND persisted_origins.technology = tb.technology
65+
WHERE tb.origin IS NULL -- Technology was not detected last month
66+
GROUP BY 1
5567
),
56-
-- Pages that were not in the dataset last month but appeared this month with the technology
57-
tech_introduced_new_pages AS (
68+
-- Origins present in the current month but absent from the previous month (new to CrUX), and their detected technologies
69+
tech_adopted_new_origins AS (
5870
SELECT
5971
tech_current.technology,
60-
COUNT(DISTINCT tech_current.page) AS total_pages_introduced_new,
61-
STRING_AGG(DISTINCT tech_current.page LIMIT 5) AS sample_pages_introduced_new
72+
COUNT(DISTINCT tech_current.origin) AS total_origins_adopted_new,
73+
--STRING_AGG(DISTINCT tech_current.origin LIMIT 5) AS sample_origins_adopted_new
6274
FROM tech_current
6375
LEFT JOIN tech_before
64-
USING (page)
65-
WHERE tech_before.page IS NULL -- Page was not present last month
66-
GROUP BY tech_current.technology
76+
USING (origin)
77+
WHERE tech_before.origin IS NULL -- origin was not present last month
78+
GROUP BY 1
6779
),
68-
-- Pages that existed this month but no longer have the technology
69-
tech_deprecated_existing_pages AS (
80+
-- Origins that persisted across both months but no longer use the technology in the current month
81+
tech_deprecated_existing_origins AS (
7082
SELECT
71-
tech_before.technology,
72-
COUNT(DISTINCT tech_before.page) AS total_pages_deprecated_existing,
73-
STRING_AGG(DISTINCT tech_before.page LIMIT 5) AS sample_pages_deprecated_existing
74-
FROM tech_before
75-
JOIN tech_current
76-
USING (page)
83+
persisted_origins.technology,
84+
COUNT(DISTINCT persisted_origins.origin) AS total_origins_deprecated_existing,
85+
STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_deprecated_existing
86+
FROM (
87+
SELECT DISTINCT
88+
tech_before.technology,
89+
tech_before.origin
90+
FROM tech_before
91+
JOIN tech_current
92+
USING (origin)
93+
) as persisted_origins
7794
LEFT JOIN tech_current AS tc
78-
ON tech_before.page = tc.page AND tech_before.technology = tc.technology
79-
WHERE tc.page IS NULL -- Technology is not detected in the current month
80-
GROUP BY tech_before.technology
95+
ON persisted_origins.origin = tc.origin AND persisted_origins.technology = tc.technology
96+
WHERE tc.origin IS NULL -- Technology is not detected in the current month
97+
GROUP BY 1
8198
),
82-
-- Pages that no longer exist in the current dataset
83-
tech_deprecated_gone_pages AS (
99+
-- Origins that were dropped from CrUX in the current month, so the technology is no longer detected on them
100+
tech_deprecated_gone_origins AS (
84101
SELECT
85102
tech_before.technology,
86-
COUNT(DISTINCT tech_before.page) AS total_pages_deprecated_gone,
87-
STRING_AGG(DISTINCT tech_before.page LIMIT 5) AS sample_pages_deprecated_gone
103+
COUNT(DISTINCT tech_before.origin) AS total_origins_deprecated_gone,
104+
--STRING_AGG(DISTINCT tech_before.origin LIMIT 5) AS sample_origins_deprecated_gone
88105
FROM tech_before
89106
LEFT JOIN tech_current
90-
USING (page)
91-
WHERE tech_current.page IS NULL -- Page no longer exists in current dataset
92-
GROUP BY tech_before.technology
107+
USING (origin)
108+
WHERE tech_current.origin IS NULL -- origin no longer exists in current dataset
109+
GROUP BY 1
93110
)
94111
95-
-- Final aggregation and comparison of technology adoption/deprecation metrics
112+
-- Aggregation of technology adoption/deprecation metrics
96113
SELECT
97-
COALESCE(before_summary.technology, tech_introduced_existing_pages.technology, tech_introduced_new_pages.technology, apps.name) AS technology,
114+
DATE('${constants.currentMonth}') AS date,
115+
COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, apps.name) AS technology,
98116
99-
-- Pages summary
100-
0-COALESCE(total_pages_deprecated_existing, 0) AS total_pages_deprecated_existing,
101-
0-COALESCE(total_pages_deprecated_gone, 0) AS total_pages_deprecated_gone,
117+
-- origins summary
118+
0-COALESCE(total_origins_deprecated_existing, 0) AS total_origins_deprecated_existing,
119+
0-COALESCE(total_origins_deprecated_gone, 0) AS total_origins_deprecated_gone,
102120
103-
COALESCE(total_pages_before, 0) - COALESCE(total_pages_deprecated_existing, 0) - COALESCE(total_pages_deprecated_gone, 0) AS total_pages_persisted,
121+
COALESCE(total_origins_before, 0) - COALESCE(total_origins_deprecated_existing, 0) - COALESCE(total_origins_deprecated_gone, 0) AS total_origins_persisted,
104122
105-
COALESCE(total_pages_introduced_existing, 0) AS total_pages_introduced_existing,
106-
COALESCE(total_pages_introduced_new, 0) AS total_pages_introduced_new,
123+
COALESCE(total_origins_adopted_existing, 0) AS total_origins_adopted_existing,
124+
COALESCE(total_origins_adopted_new, 0) AS total_origins_adopted_new,
107125
108-
-- Sample pages
109-
COALESCE(sample_pages_deprecated_existing, "") AS sample_pages_deprecated_existing,
110-
COALESCE(sample_pages_deprecated_gone, "") AS sample_pages_deprecated_gone,
126+
-- Sample origins
127+
COALESCE(sample_origins_deprecated_existing, "") AS sample_origins_deprecated_existing,
128+
--COALESCE(sample_origins_deprecated_gone, "") AS sample_origins_deprecated_gone,
111129
112-
COALESCE(tech_introduced_existing_pages.sample_pages_introduced_existing, "") AS sample_pages_introduced_existing,
113-
COALESCE(tech_introduced_new_pages.sample_pages_introduced_new, "") AS sample_pages_introduced_new
130+
COALESCE(tech_adopted_existing_origins.sample_origins_adopted_existing, "") AS sample_origins_adopted_existing,
131+
--COALESCE(tech_adopted_new_origins.sample_origins_adopted_new, "") AS sample_origins_adopted_new
114132
115133
FROM tech_before_summary before_summary
116-
FULL OUTER JOIN tech_introduced_existing_pages
117-
ON before_summary.technology = tech_introduced_existing_pages.technology
118-
FULL OUTER JOIN tech_introduced_new_pages
119-
ON before_summary.technology = tech_introduced_new_pages.technology
120-
LEFT JOIN tech_deprecated_existing_pages
121-
ON before_summary.technology = tech_deprecated_existing_pages.technology
122-
LEFT JOIN tech_deprecated_gone_pages
123-
ON before_summary.technology = tech_deprecated_gone_pages.technology
134+
FULL OUTER JOIN tech_adopted_existing_origins
135+
ON before_summary.technology = tech_adopted_existing_origins.technology
136+
FULL OUTER JOIN tech_adopted_new_origins
137+
ON before_summary.technology = tech_adopted_new_origins.technology
138+
LEFT JOIN tech_deprecated_existing_origins
139+
ON before_summary.technology = tech_deprecated_existing_origins.technology
140+
LEFT JOIN tech_deprecated_gone_origins
141+
ON before_summary.technology = tech_deprecated_gone_origins.technology
124142
FULL OUTER JOIN wappalyzer.apps
125143
ON before_summary.technology = apps.name
126-
ORDER BY total_pages_persisted DESC
127144
`)

0 commit comments

Comments
 (0)