Skip to content

Commit 959ad5e

Browse files
clean no valid technologies (#53)
1 parent 88387c6 commit 959ad5e

File tree

1 file changed

+11
-9
lines changed

1 file changed

+11
-9
lines changed

definitions/output/crawl/pages.js

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ WHERE date = '${constants.currentMonth}' AND
7373
${constants.devRankFilter}
7474
`).postOps(ctx => `
7575
CREATE TEMP TABLE technologies_cleaned AS (
76-
WITH wappalyzer AS (
76+
WITH technologies AS (
7777
SELECT DISTINCT
7878
name AS technology,
7979
categories
@@ -101,13 +101,13 @@ CREATE TEMP TABLE technologies_cleaned AS (
101101
LEFT JOIN pages.categories AS category
102102
WHERE
103103
-- Technology is corrupted
104-
technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR
104+
technology NOT IN (SELECT DISTINCT technology FROM technologies) OR
105105
-- Technology's category is corrupted
106106
CONCAT(technology, category) NOT IN (
107107
SELECT DISTINCT
108108
CONCAT(technology, category)
109-
FROM wappalyzer
110-
LEFT JOIN wappalyzer.categories AS category
109+
FROM technologies
110+
INNER JOIN technologies.categories AS category
111111
)
112112
),
113113
@@ -118,14 +118,14 @@ CREATE TEMP TABLE technologies_cleaned AS (
118118
page,
119119
ARRAY_AGG(STRUCT(
120120
pages.technology,
121-
wappalyzer.categories,
121+
technologies.categories,
122122
pages.info
123123
)) AS technologies
124124
FROM pages
125125
INNER JOIN impacted_pages
126126
USING (client, page)
127-
INNER JOIN wappalyzer
128-
ON pages.technology = wappalyzer.technology
127+
INNER JOIN technologies
128+
USING (technology)
129129
GROUP BY
130130
client,
131131
page
@@ -134,8 +134,10 @@ CREATE TEMP TABLE technologies_cleaned AS (
134134
SELECT
135135
client,
136136
page,
137-
technologies
138-
FROM reconstructed_technologies
137+
reconstructed_technologies.technologies
138+
FROM impacted_pages
139+
LEFT JOIN reconstructed_technologies
140+
USING(client,page)
139141
);
140142
141143
-- Update the crawl.pages table with the cleaned and restored technologies

0 commit comments

Comments
 (0)