/**
 * wappalyzer.tech_detections
 *
 * Month-over-month summary, per technology, of how many origins kept,
 * adopted or dropped the technology between the previous month and
 * `constants.currentMonth`.
 *
 * The table is incremental and partitioned by `date`; every run emits rows
 * dated `constants.currentMonth`.
 */

// NOTE(review): fnPastMonth is defined outside this file; presumably it returns
// the crawl date one month before the date passed in. Confirm against constants.
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_detections', {
  // The Looker Studio path segment is "/page/<page id>"; it is part of the URL,
  // not a reference to the page -> origin column rename done in the query below.
  description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/page/p_an38lbzywc/edit',
  schema: 'wappalyzer',
  type: 'incremental',
  protected: true,
  bigquery: {
    partitionBy: 'date',
  },
  tags: ['crawl_complete']
}).preOps(ctx => ctx.when(ctx.incremental(), `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.currentMonth}';
`)).query(ctx => `
WITH source AS (
  SELECT DISTINCT
    date,
    root_page AS origin,
    tech.technology
  FROM ${ctx.ref('crawl', 'pages')},
    UNNEST(technologies) AS tech
  WHERE date IN ('${pastMonth}', '${constants.currentMonth}') ${constants.devRankFilter}
),

-- Technology in the previous month
tech_before AS (
  SELECT
    origin,
    technology
  FROM source
  WHERE date = '${pastMonth}'
),

-- Technology in the current month
tech_current AS (
  SELECT
    origin,
    technology
  FROM source
  WHERE date = '${constants.currentMonth}'
),

-- Summary of technology per origin in the previous month
tech_before_summary AS (
  SELECT
    technology,
    COUNT(DISTINCT origin) AS total_origins_before
  FROM tech_before
  GROUP BY technology
),

-- origins that persisted across both months and adopted the technology in the current month
tech_adopted_existing_origins AS (
  SELECT
    persisted_origins.technology,
    COUNT(DISTINCT persisted_origins.origin) AS total_origins_adopted_existing,
    STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_adopted_existing
  FROM (
    SELECT DISTINCT
      tech_current.technology,
      tech_current.origin
    FROM tech_before
    JOIN tech_current
    USING (origin)
  ) as persisted_origins
  LEFT JOIN tech_before AS tb
  ON persisted_origins.origin = tb.origin AND persisted_origins.technology = tb.technology
  WHERE tb.origin IS NULL -- Technology was not detected last month
  GROUP BY 1
),

-- origins that arrived to CrUX in the current month and their detected technologies
tech_adopted_new_origins AS (
  SELECT
    tech_current.technology,
    COUNT(DISTINCT tech_current.origin) AS total_origins_adopted_new,
    --STRING_AGG(DISTINCT tech_current.origin LIMIT 5) AS sample_origins_adopted_new
  FROM tech_current
  LEFT JOIN tech_before
  USING (origin)
  WHERE tech_before.origin IS NULL -- origin was not present last month
  GROUP BY 1
),

-- origins that persisted across both months and deprecated the technology usage in the current month
tech_deprecated_existing_origins AS (
  SELECT
    persisted_origins.technology,
    COUNT(DISTINCT persisted_origins.origin) AS total_origins_deprecated_existing,
    STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_deprecated_existing
  FROM (
    SELECT DISTINCT
      tech_before.technology,
      tech_before.origin
    FROM tech_before
    JOIN tech_current
    USING (origin)
  ) as persisted_origins
  LEFT JOIN tech_current AS tc
  ON persisted_origins.origin = tc.origin AND persisted_origins.technology = tc.technology
  WHERE tc.origin IS NULL -- Technology is not detected in the current month
  GROUP BY 1
),

-- origins that were dropped from CrUX in the current dataset, and thus the technology was not detected anymore
tech_deprecated_gone_origins AS (
  SELECT
    tech_before.technology,
    COUNT(DISTINCT tech_before.origin) AS total_origins_deprecated_gone,
    --STRING_AGG(DISTINCT tech_before.origin LIMIT 5) AS sample_origins_deprecated_gone
  FROM tech_before
  LEFT JOIN tech_current
  USING (origin)
  WHERE tech_current.origin IS NULL -- origin no longer exists in current dataset
  GROUP BY 1
)

-- aggregation of technology adoption/deprecation metrics
SELECT
  DATE('${constants.currentMonth}') AS date,
  COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, apps.name) AS technology,

  -- origins summary
  0-COALESCE(total_origins_deprecated_existing, 0) AS total_origins_deprecated_existing,
  0-COALESCE(total_origins_deprecated_gone, 0) AS total_origins_deprecated_gone,

  COALESCE(total_origins_before, 0) - COALESCE(total_origins_deprecated_existing, 0) - COALESCE(total_origins_deprecated_gone, 0) AS total_origins_persisted,

  COALESCE(total_origins_adopted_existing, 0) AS total_origins_adopted_existing,
  COALESCE(total_origins_adopted_new, 0) AS total_origins_adopted_new,

  -- Sample origins
  COALESCE(sample_origins_deprecated_existing, "") AS sample_origins_deprecated_existing,
  --COALESCE(sample_origins_deprecated_gone, "") AS sample_origins_deprecated_gone,

  COALESCE(tech_adopted_existing_origins.sample_origins_adopted_existing, "") AS sample_origins_adopted_existing,
  --COALESCE(tech_adopted_new_origins.sample_origins_adopted_new, "") AS sample_origins_adopted_new

FROM tech_before_summary before_summary
FULL OUTER JOIN tech_adopted_existing_origins
  ON before_summary.technology = tech_adopted_existing_origins.technology
-- Later joins key on the technology name resolved so far: before_summary.technology
-- is NULL for technologies that were absent last month, and keying on it alone
-- would emit a separate row per source for the same technology.
FULL OUTER JOIN tech_adopted_new_origins
  ON COALESCE(before_summary.technology, tech_adopted_existing_origins.technology) = tech_adopted_new_origins.technology
-- Deprecated technologies were, by construction, present last month, so
-- before_summary.technology is a sufficient key for these two joins.
LEFT JOIN tech_deprecated_existing_origins
  ON before_summary.technology = tech_deprecated_existing_origins.technology
LEFT JOIN tech_deprecated_gone_origins
  ON before_summary.technology = tech_deprecated_gone_origins.technology
FULL OUTER JOIN wappalyzer.apps
  ON COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology) = apps.name
`)