Skip to content

Commit b44e7e3

Browse files
authored
Merge pull request #4313 from JetBrains/ktl-1516-migration-ga-in-search
KTL-1516: migration ga in search
2 parents 5c0f322 + c5c3efd commit b44e7e3

File tree

5 files changed

+149
-66
lines changed

5 files changed

+149
-66
lines changed

.teamcity/builds/kotlinlang/buidTypes/BuildSearchIndex.kt

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,11 @@ import jetbrains.buildServer.configs.kotlin.buildSteps.script
1111
import jetbrains.buildServer.configs.kotlin.triggers.schedule
1212
import vcsRoots.KotlinLangOrg
1313

14-
1514
object BuildSearchIndex : BuildType({
1615
name = "Build Site Search Index"
1716
description = "Build search index for Algolia using Google Analytics data"
1817

1918
params {
20-
param("env.KEY_FILE_LOCATION", "/secrets/google-credentials.json")
2119
param("virtualenv.folder", "_environment")
2220
param("env.WH_INDEX_NAME", SEARCH_INDEX_NAME)
2321
param("env.WH_SEARCH_USER", SEARCH_APP_ID)
@@ -33,12 +31,9 @@ object BuildSearchIndex : BuildType({
3331

3432
steps {
3533
script {
34+
name = "Push search index"
3635
scriptContent = """
37-
#!/bin/bash
38-
39-
## refresh packages
40-
pip install -r requirements.txt
41-
36+
#!/bin/bash
4237
python kotlin-website.py index
4338
""".trimIndent()
4439
dockerImage = "%dep.Kotlin_KotlinSites_Builds_KotlinlangOrg_BuildPythonContainer.kotlin-website-image%"
@@ -71,6 +66,17 @@ object BuildSearchIndex : BuildType({
7166
onDependencyFailure = FailureAction.FAIL_TO_START
7267
onDependencyCancel = FailureAction.CANCEL
7368
}
69+
70+
dependency(PageViews) {
71+
snapshot {}
72+
73+
artifacts {
74+
artifactRules = """
75+
page_views_map.json => data/
76+
""".trimIndent()
77+
}
78+
}
79+
7480
dependency(BuildSitePages) {
7581
snapshot {}
7682

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,58 @@
11
package builds.kotlinlang.buidTypes
22

3+
import jetbrains.buildServer.configs.kotlin.AbsoluteId
34
import jetbrains.buildServer.configs.kotlin.BuildType
5+
import jetbrains.buildServer.configs.kotlin.buildSteps.ScriptBuildStep
6+
import jetbrains.buildServer.configs.kotlin.buildSteps.script
7+
import jetbrains.buildServer.configs.kotlin.triggers.finishBuildTrigger
8+
import java.io.File
9+
import java.nio.file.Paths
10+
11+
/**
 * Loads the text of a Node.js helper script.
 *
 * The path `scripts/<name>.mjs` is resolved against the current working
 * directory of the process evaluating these settings.
 *
 * @param name script path relative to `scripts/`, without the `.mjs` extension
 * @return the full contents of the script file
 */
private fun readScript(name: String): String =
    Paths.get("scripts/$name.mjs")
        .toAbsolutePath()
        .toFile()
        .readText()
15+
16+
private val pageViewsCollectId = AbsoluteId("WebTeam_BuildsForDeploymentJetBrainsCom_Algolia_PageViewsFromGoogle")
417

518
/**
 * Build configuration that turns the raw page-view export into the
 * `page_views_list.json` and `page_views_map.json` artifacts.
 *
 * It is triggered when the build identified by [pageViewsCollectId] finishes
 * successfully on the default branch, downloads that build's
 * `unique_pageviews_pages_000000000000.json` into `data`, and runs the
 * `stats/pageviews` Node.js script inside a `node:lts-slim` container.
 */
object PageViews : BuildType({
    name = "Fetch Page Views"
    description = "Build data files with page views statistics for kotlin websites"

    artifactRules = """
        page_views_list.json
        page_views_map.json
    """.trimIndent()

    triggers {
        finishBuildTrigger {
            buildType = pageViewsCollectId.absoluteId
            branchFilter = "+:<default>"
            successfulOnly = true
        }
    }

    steps {
        script {
            name = "Prepare page views"
            // The step is a bash/JavaScript polyglot: bash executes the second line,
            // which re-launches the very same file ($0) through node as an ES module,
            // while JavaScript sees that line as a string literal followed by a comment.
            //
            // The script is assembled from explicit lines instead of calling
            // trimIndent() on a template: trimIndent() runs after interpolation, and
            // the embedded script contains lines at column 0, so the common indent
            // would be zero and the "#!" line could stay indented (no longer a valid
            // hashbang once node parses the file).
            scriptContent = listOf(
                "#!/usr/bin/env bash",
                "\":\" //# comment; exec /usr/bin/env node --input-type=module - \"\$@\" < \"\$0\"",
                "",
                readScript("stats/pageviews"),
            ).joinToString("\n")
            dockerImage = "node:lts-slim"
            dockerImagePlatform = ScriptBuildStep.ImagePlatform.Linux
            dockerPull = true
        }
    }

    dependencies {
        artifacts(pageViewsCollectId) {
            buildRule = lastSuccessful()
            artifactRules = """
                +:unique_pageviews_pages_000000000000.json => data
            """.trimIndent()
        }
    }
})

.teamcity/scripts/stats/pageviews.mjs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import { open } from 'node:fs/promises';
2+
3+
const INPUT_FILE_PATH = 'data/unique_pageviews_pages_000000000000.json';

/**
 * Converts one line of the newline-delimited JSON export into a report record.
 *
 * @param {string} line a JSON object with `webpage` and `unique_pageviews` fields
 * @returns {{ url: string, pageviews: number } | null} the record to report, or
 *   `null` when the line must be skipped (bad counter, no views, malformed URL,
 *   or a host that does not contain `kotlinlang.org`)
 * @throws {SyntaxError} when the line is not valid JSON
 */
function parseRecord(line) {
    const { webpage: url, unique_pageviews: views } = JSON.parse(line);

    const pageviews = Number(views);

    // Number('') is 0, so the empty string has to be rejected explicitly.
    if (views === '' || Number.isNaN(pageviews)) {
        console.warn(`${url} has incorrect unique_pageviews=${views}`);
        return null;
    }

    if (pageviews < 1) return null;

    let host;

    try {
        host = new URL(url).host;
    } catch {
        // One malformed URL should not abort the whole report.
        console.warn(`${url} is not a valid absolute URL`);
        return null;
    }

    if (!host.includes('kotlinlang.org')) return null;

    return { url, pageviews };
}

let input;
let listViews;
let mapViews;

try {
    input = await open(INPUT_FILE_PATH, 'r');
    // The 'w' flag creates the file or truncates an existing one.
    listViews = await open('page_views_list.json', 'w');
    mapViews = await open('page_views_map.json', 'w');

    await Promise.all([
        listViews.appendFile('['),
        mapViews.appendFile('{'),
    ]);

    // Written in front of every entry except the first one, so the output is
    // valid JSON even when no line passes the filters ("[]" and "{}").
    let separator = '';

    // Lines are handled strictly one after another: each file handle has at
    // most one write in flight, and the output order matches the input order.
    for await (const line of input.readLines()) {
        if (line.trim() === '') continue;

        const record = parseRecord(line);

        if (record === null) continue;

        await Promise.all([
            listViews.appendFile(separator + JSON.stringify(record)),
            mapViews.appendFile(`${separator}${JSON.stringify(record.url)}: ${record.pageviews}`),
        ]);

        separator = ',';
    }

    await Promise.all([
        listViews.appendFile(']'),
        mapViews.appendFile('}'),
    ]);
} finally {
    // Close whatever was actually opened; a handle is undefined if its open() failed.
    await Promise.all([input, listViews, mapViews].map(file => file?.close()));
}

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,4 @@ git+https://github.com/pik-software/geocoder.git@yandex-api-key#egg=geocoder
1414
ruamel.yaml==0.17.21
1515
PyYAML==5.4.1
1616
algoliasearch==1.20.0
17-
google-api-python-client==1.6.2
1817
Werkzeug==2.3.8

src/search.py

Lines changed: 12 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,24 @@
1+
import json
12
import os
23
from typing import Dict, List, Iterator
34

45
from algoliasearch import algoliasearch
56
from algoliasearch.index import Index
67
from bs4 import Tag
7-
from googleapiclient.discovery import build, Resource
8-
from oauth2client.service_account import ServiceAccountCredentials
98

109
from src.api import get_api_page
1110
from src.dist import get_dist_page_xml, dist_path
1211

1312

14-
def initialize_analyticsreporting() -> Resource:
15-
credentials = ServiceAccountCredentials.from_json_keyfile_name(
16-
os.environ['KEY_FILE_LOCATION'], scopes='https://www.googleapis.com/auth/analytics.readonly')
17-
analytics = build('analyticsreporting', 'v4', credentials=credentials)
18-
return analytics
19-
20-
21-
def get_report(analytics: Resource) -> Dict:
22-
return analytics.reports().batchGet(
23-
body={
24-
"reportRequests":
25-
[
26-
{
27-
"viewId": "85132606",
28-
"samplingLevel": "LARGE",
29-
"filtersExpression": "ga:hostname==kotlinlang.org;ga:pagepath!@?",
30-
"pageSize": 10000,
31-
"orderBys": [
32-
{
33-
"fieldName": "ga:uniquepageviews",
34-
"sortOrder": "DESCENDING"
35-
}
36-
],
37-
"dateRanges":
38-
[
39-
{
40-
"startDate": "30daysAgo",
41-
"endDate": "yesterday"
42-
}
43-
],
44-
"metrics":
45-
[
46-
{
47-
"expression": "ga:uniquepageviews",
48-
"alias": ""
49-
}
50-
],
51-
"dimensions":
52-
[
53-
{
54-
"name": "ga:pagePath"
55-
}
56-
]
57-
}
58-
]
59-
}).execute()
13+
def get_page_views_statistic() -> Dict[str, int]:
    """Load the page view statistic from ``data/page_views_map.json``.

    The path is relative to the current working directory. The file is
    expected to hold a single JSON object.

    NOTE(review): keys are presumably full page URLs and values unique page
    view counts, as the return annotation suggests; confirm against the job
    that produces the file.

    :return: the parsed JSON object.
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the file is not valid JSON.
    """
    print("Acquiring page view statistic")

    # The context manager closes the file even when json.load() raises; the
    # explicit encoding keeps parsing independent of the container's locale.
    with open("data/page_views_map.json", "r", encoding="utf-8") as file:
        page_views = json.load(file)

    print("Page view statistic acquired")

    return page_views
7123

7224

@@ -110,7 +62,8 @@ def get_valuable_content(page_path, content: Iterator[Tag]) -> List[str]:
11062
valuable_content.append(child.text)
11163
elif child.name in ['ul', 'ol', 'blockquote', 'div', 'section', 'dl']:
11264
valuable_content += get_valuable_content(page_path, child.children)
113-
elif child.name in ['figure', 'iframe', 'pre', 'code', 'hr', 'table', 'script', 'link', 'a', 'br', 'i', 'img', 'object']:
65+
elif child.name in ['figure', 'iframe', 'pre', 'code', 'hr', 'table', 'script', 'link', 'a', 'br', 'i', 'img',
66+
'object']:
11467
continue
11568
else:
11669
raise Exception('Unknown tag "' + child.name + '" in ' + page_path)
@@ -243,8 +196,9 @@ def build_search_indices(pages):
243196
page_path = get_page_path_from_url(url)
244197
page_views = 0
245198

246-
if url in page_views_statistic:
247-
page_views = page_views_statistic[url]
199+
public_url = "https://kotlinlang.org" + url
200+
if public_url in page_views_statistic:
201+
page_views = page_views_statistic[public_url]
248202

249203
if type == 'Page_Community':
250204
page_type = 'Community'

0 commit comments

Comments
 (0)