Skip to content

Commit fb25798

Browse files
authored
Analytics: add status and regiter analytics from client only (#12346)
We are moving the pageview for 404 pages to the client side, similar to what we are doing for 200 pages. This should reduce the load that bots produce when scanning our docs and hitting a bunch of 404 pages. The new approach allows us to enable analytics 200/404 for all the projects without a feature flag. Closes #12307
1 parent b47ab80 commit fb25798

File tree

7 files changed

+118
-357
lines changed

7 files changed

+118
-357
lines changed

readthedocs/analytics/proxied_api.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,20 @@
66
import structlog
77
from django.http import JsonResponse
88
from django.shortcuts import get_object_or_404
9-
from rest_framework import status
9+
from rest_framework import status as status_codes
1010
from rest_framework.response import Response
1111
from rest_framework.views import APIView
1212

1313
from readthedocs.analytics.models import PageView
1414
from readthedocs.api.v2.permissions import IsAuthorizedToViewVersion
1515
from readthedocs.core.mixins import CDNCacheControlMixin
16+
from readthedocs.core.unresolver import InvalidPathForVersionedProjectError
1617
from readthedocs.core.unresolver import UnresolverError
17-
from readthedocs.core.unresolver import unresolve
18+
from readthedocs.core.unresolver import unresolver
1819
from readthedocs.core.utils.extend import SettingsOverrideObject
1920
from readthedocs.core.utils.requests import is_suspicious_request
2021
from readthedocs.projects.models import Project
22+
from readthedocs.proxito.views.hosting import IsAuthorizedToViewProject
2123

2224

2325
log = structlog.get_logger(__name__) # noqa
@@ -38,7 +40,7 @@ class BaseAnalyticsView(CDNCacheControlMixin, APIView):
3840
# so we capture all views/interactions.
3941
cache_response = False
4042
http_method_names = ["get"]
41-
permission_classes = [IsAuthorizedToViewVersion]
43+
permission_classes = [IsAuthorizedToViewProject | IsAuthorizedToViewVersion]
4244

4345
@lru_cache(maxsize=1)
4446
def _get_project(self):
@@ -50,31 +52,38 @@ def _get_project(self):
5052
def _get_version(self):
5153
version_slug = self.request.GET.get("version")
5254
project = self._get_project()
53-
version = get_object_or_404(
54-
project.versions.all(),
55-
slug=version_slug,
56-
)
55+
# Do not call `get_object_or_404` because there may be some invalid URLs without versions.
56+
# We do want to track those 404 pages as well. In that case, the `filename` attribute is the `path`.
57+
version = project.versions.filter(slug=version_slug).first()
5758
return version
5859

5960
def get(self, request, *args, **kwargs):
6061
# TODO: Use absolute_uri only, we don't need project and version.
6162
project = self._get_project()
6263
version = self._get_version()
6364
absolute_uri = self.request.GET.get("absolute_uri")
65+
status = self.request.GET.get("status", "200")
6466
if not absolute_uri:
6567
return JsonResponse(
6668
{"error": "'absolute_uri' GET attribute is required"},
67-
status=status.HTTP_400_BAD_REQUEST,
69+
status=status_codes.HTTP_400_BAD_REQUEST,
70+
)
71+
72+
if status not in ("200", "404"):
73+
return JsonResponse(
74+
{"error": "'status' GET attribute should be 200 or 404"},
75+
status=status_codes.HTTP_400_BAD_REQUEST,
6876
)
6977

7078
self.increase_page_view_count(
7179
project=project,
7280
version=version,
7381
absolute_uri=absolute_uri,
82+
status=status,
7483
)
75-
return Response(status=status.HTTP_204_NO_CONTENT)
84+
return Response(status=status_codes.HTTP_204_NO_CONTENT)
7685

77-
def increase_page_view_count(self, project, version, absolute_uri):
86+
def increase_page_view_count(self, project, version, absolute_uri, status):
7887
"""Increase the page view count for the given project."""
7988
if is_suspicious_request(self.request):
8089
log.info(
@@ -83,28 +92,49 @@ def increase_page_view_count(self, project, version, absolute_uri):
8392
)
8493
return
8594

95+
# Don't track 200 if the version doesn't exist
96+
if status == "200" and not version:
97+
return
98+
8699
# Don't allow tracking page views from external domains.
87100
if self.request.unresolved_domain.is_from_external_domain:
88101
return
89102

103+
# Don't track external versions.
104+
if version and version.is_external:
105+
return
106+
107+
absolute_uri_parsed = urlparse(absolute_uri)
90108
try:
91-
unresolved = unresolve(absolute_uri)
109+
unresolved = unresolver.unresolve_url(absolute_uri)
110+
filename = unresolved.filename
111+
absolute_uri_project = unresolved.project
112+
except InvalidPathForVersionedProjectError as exc:
113+
# If the version is missing, we still want to log this request.
114+
#
115+
# If we don't have a version, the filename is the path,
116+
# otherwise it would be empty.
117+
filename = exc.path
118+
absolute_uri_project = exc.project
92119
except UnresolverError:
93120
# If we were unable to resolve the URL, it
94121
# isn't pointing to a valid RTD project.
95122
return
96123

97-
# Don't track external versions.
98-
if version.is_external or not unresolved.filename:
124+
if absolute_uri_project.slug != project.slug:
125+
log.warning(
126+
"Skipping page view count since projects don't match",
127+
project_slug=project.slug,
128+
uri_project_slug=absolute_uri_project.slug,
129+
)
99130
return
100131

101-
path = urlparse(absolute_uri).path
102132
PageView.objects.register_page_view(
103133
project=project,
104134
version=version,
105-
filename=unresolved.filename,
106-
path=path,
107-
status=200,
135+
filename=filename,
136+
path=absolute_uri_parsed.path,
137+
status=status,
108138
)
109139

110140

readthedocs/analytics/tests.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,41 @@ def test_invalid_uri(self):
135135
self.client.get(url, headers={"host": self.host})
136136
assert PageView.objects.all().count() == 0
137137

138+
def test_uri_for_another_project(self):
139+
other_project = get(
140+
Project,
141+
slug="other",
142+
)
143+
other_project.versions.all().update(privacy_level=PUBLIC)
144+
145+
# Host and ``absolute_uri`` are from different projects
146+
assert PageView.objects.all().count() == 0
147+
url = (
148+
reverse("analytics_api")
149+
+ f"?project={self.project.slug}&version=latest"
150+
f"&absolute_uri=https://other.readthedocs.io/en/latest/"
151+
)
152+
self.client.get(url, headers={"host": self.host})
153+
assert PageView.objects.all().count() == 0
154+
155+
# Host and ``absolute_uri`` are from different projects with no ``?version`` attribute
156+
url = (
157+
reverse("analytics_api")
158+
+ f"?project={self.project.slug}"
159+
f"&absolute_uri=https://other.readthedocs.io/en/latest/"
160+
)
161+
self.client.get(url, headers={"host": self.host})
162+
assert PageView.objects.all().count() == 0
163+
164+
# Host and ``absolute_uri`` are from the same project
165+
url = (
166+
reverse("analytics_api")
167+
+ f"?project=other&version=latest"
168+
f"&absolute_uri=https://other.readthedocs.io/en/latest/"
169+
)
170+
self.client.get(url, headers={"host": "other.readthedocs.io"})
171+
assert PageView.objects.all().count() == 1
172+
138173
def test_cache_headers(self):
139174
resp = self.client.get(self.url, headers={"host": self.host})
140175
self.assertEqual(resp.status_code, 204)
@@ -207,3 +242,33 @@ def test_dont_track_external_domains(self):
207242
r = self.client.get(self.url, headers={"host": host})
208243
self.assertEqual(r.status_code, 204)
209244
self.assertEqual(PageView.objects.all().count(), 0)
245+
246+
def test_notfound_404_pages(self):
247+
self.assertEqual(PageView.objects.all().count(), 0)
248+
url = self.url + "&status=404"
249+
resp = self.client.get(url, headers={"host": self.host})
250+
self.assertEqual(resp.status_code, 204)
251+
self.assertEqual(PageView.objects.all().count(), 1)
252+
self.assertEqual(PageView.objects.filter(status=404).count(), 1)
253+
254+
def test_notfound_404_page_without_version(self):
255+
self.assertEqual(PageView.objects.all().count(), 0)
256+
absolute_uri = (
257+
f"https://{self.project.slug}.readthedocs.io/index.html"
258+
)
259+
url = (
260+
reverse("analytics_api")
261+
+ f"?project={self.project.slug}&version=null"
262+
f"&absolute_uri={absolute_uri}"
263+
"&status=404"
264+
)
265+
266+
resp = self.client.get(url, headers={"host": self.host})
267+
pageview = PageView.objects.all().first()
268+
269+
self.assertEqual(resp.status_code, 204)
270+
self.assertEqual(PageView.objects.all().count(), 1)
271+
self.assertIsNone(pageview.version)
272+
self.assertEqual(pageview.project.slug, self.project.slug)
273+
self.assertEqual(pageview.path, "/index.html")
274+
self.assertEqual(pageview.status, 404)

readthedocs/projects/models.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,7 +1950,6 @@ def add_features(sender, **kwargs):
19501950

19511951
# Feature constants - this is not a exhaustive list of features, features
19521952
# may be added by other packages
1953-
RECORD_404_PAGE_VIEWS = "record_404_page_views"
19541953
DISABLE_PAGEVIEWS = "disable_pageviews"
19551954
RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
19561955
USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix"
@@ -1975,10 +1974,6 @@ def add_features(sender, **kwargs):
19751974
BUILD_NO_ACKS_LATE = "build_no_acks_late"
19761975

19771976
FEATURES = (
1978-
(
1979-
RECORD_404_PAGE_VIEWS,
1980-
_("Proxito: Record 404s page views."),
1981-
),
19821977
(
19831978
DISABLE_PAGEVIEWS,
19841979
_("Proxito: Disable all page views"),

readthedocs/projects/views/private.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@
7171
from readthedocs.projects.models import Domain
7272
from readthedocs.projects.models import EmailHook
7373
from readthedocs.projects.models import EnvironmentVariable
74-
from readthedocs.projects.models import Feature
7574
from readthedocs.projects.models import Project
7675
from readthedocs.projects.models import ProjectRelationship
7776
from readthedocs.projects.models import WebHook
@@ -1229,15 +1228,12 @@ def get_context_data(self, **kwargs):
12291228

12301229
# Count of views for top pages over the month
12311230
top_pages_200 = PageView.top_viewed_pages(project, limit=25)
1232-
track_404 = project.has_feature(Feature.RECORD_404_PAGE_VIEWS)
1233-
top_pages_404 = []
1234-
if track_404:
1235-
top_pages_404 = PageView.top_viewed_pages(
1236-
project,
1237-
limit=25,
1238-
status=404,
1239-
per_version=True,
1240-
)
1231+
top_pages_404 = PageView.top_viewed_pages(
1232+
project,
1233+
limit=25,
1234+
status=404,
1235+
per_version=True,
1236+
)
12411237

12421238
# Aggregate pageviews grouped by day
12431239
page_data = PageView.page_views_by_date(
@@ -1249,7 +1245,6 @@ def get_context_data(self, **kwargs):
12491245
"top_pages_200": top_pages_200,
12501246
"page_data": page_data,
12511247
"top_pages_404": top_pages_404,
1252-
"track_404": track_404,
12531248
}
12541249
)
12551250

0 commit comments

Comments
 (0)