Skip to content

Commit 0b8d8db

Browse files
committed
feat: add spam detection endpoints api/spam/get-latest-batch/ and api/spam/update. A SpamModeration record with status SCHEDULED_FOR_CHECK is stored on every Job, Event, and Codebase submission. A decoupled external service will query for these records to check them for spam.
1 parent 1604bc6 commit 0b8d8db

File tree

10 files changed

+464
-15
lines changed

10 files changed

+464
-15
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ SECRETS_DIR=${BUILD_DIR}/secrets
1212
DB_PASSWORD_PATH=${SECRETS_DIR}/db_password
1313
PGPASS_PATH=${SECRETS_DIR}/.pgpass
1414
SECRET_KEY_PATH=${SECRETS_DIR}/django_secret_key
15-
EXT_SECRETS=hcaptcha_secret github_client_secret orcid_client_secret discourse_api_key discourse_sso_secret mail_api_key
15+
EXT_SECRETS=hcaptcha_secret github_client_secret orcid_client_secret discourse_api_key discourse_sso_secret mail_api_key llm_spam_check_api_key
1616
GENERATED_SECRETS=$(DB_PASSWORD_PATH) $(PGPASS_PATH) $(SECRET_KEY_PATH)
1717

1818
ENVREPLACE := deploy/scripts/envreplace

base.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ services:
6868
- orcid_client_secret
6969
- hcaptcha_secret
7070
- mail_api_key
71+
- llm_spam_check_api_key
7172
volumes:
7273
- ./deploy/elasticsearch.conf.d:/etc/elasticsearch
7374
- ./docker/shared:/shared
@@ -101,6 +102,8 @@ secrets:
101102
file: ./build/secrets/mail_api_key
102103
orcid_client_secret:
103104
file: ./build/secrets/orcid_client_secret
105+
llm_spam_check_api_key:
106+
file: ./build/secrets/llm_spam_check_api_key
104107

105108
volumes:
106109
esdata:

django/core/mixins.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -295,27 +295,37 @@ def mark_spam(self, request, **kwargs):
295295
return redirect(instance.get_list_url())
296296

297297
def handle_spam_detection(self, serializer: serializers.Serializer):
    """Create or refresh the SpamModeration record for the saved instance.

    ``serializer.instance`` is first checked with
    ``_validate_content_object`` and then handed to ``_record_spam``
    together with the optional ``"spam_context"`` entry of
    ``serializer.context`` (``None`` when the key is absent).

    A ``ValueError`` raised by either step is logged as a warning and is
    not re-raised; any other exception propagates to the caller.
    """
    try:
        self._validate_content_object(serializer.instance)
        # dict.get() returns None when no spam context was attached, which
        # is exactly the "no prior detection" value _record_spam expects.
        self._record_spam(
            serializer.instance,
            serializer.context.get("spam_context"),
        )
    except ValueError as e:
        logger.warning("Cannot flag %s as spam: %s", serializer.instance, e)
306310

307-
def _record_spam(self, instance, spam_context: dict = None):
    """Put the SpamModeration row for ``instance`` into SCHEDULED_FOR_CHECK.

    A row keyed by the instance's content type and id is fetched or created.
    On creation, ``detection_method`` and ``detection_details`` are taken
    from ``spam_context`` when it is truthy and default to empty strings
    otherwise. An already existing row only has its status reset and is
    saved again; its detection fields are left untouched.

    Raises:
        KeyError: if a truthy ``spam_context`` lacks ``"detection_method"``
            or ``"detection_details"``.
    """
    model_type = ContentType.objects.get_for_model(type(instance))

    if spam_context:
        method = spam_context["detection_method"]
        details = spam_context["detection_details"]
    else:
        method = details = ""

    scheduled = SpamModeration.Status.SCHEDULED_FOR_CHECK
    # SpamModeration updates the content instance on save
    moderation, was_created = SpamModeration.objects.get_or_create(
        content_type=model_type,
        object_id=instance.id,
        defaults={
            "status": scheduled,
            "detection_method": method,
            "detection_details": details,
        },
    )
    if was_created:
        return

    # Existing record: re-queue it for another check.
    moderation.status = SpamModeration.Status.SCHEDULED_FOR_CHECK
    moderation.save()

django/core/models.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ class Status(models.TextChoices):
137137
UNREVIEWED = "unreviewed", _("Unreviewed")
138138
SPAM = "spam", _("Confirmed spam")
139139
NOT_SPAM = "not_spam", _("Confirmed not spam")
140+
SCHEDULED_FOR_CHECK = "scheduled_for_check", _("Scheduled for check by LLM")
141+
SPAM_LIKELY = "spam_likely", _("Marked spam by LLM")
142+
NOT_SPAM_LIKELY = "not_spam_likely", _("Marked as not spam by LLM")
140143

141144
status = models.CharField(
142145
choices=Status.choices,
@@ -166,6 +169,16 @@ class Status(models.TextChoices):
166169
blank=True,
167170
)
168171

172+
# detection_details is a JSON field
173+
def mark_as_spam_by_llm(self, status: Status, detection_details=None):
    """Store a verdict attributed to the LLM on this record and save it.

    Args:
        status: the status value to assign; it is stored as given, without
            validation.
        detection_details: optional details; they replace the stored
            ``detection_details`` only when truthy.

    The reviewer is cleared and ``detection_method`` is set to ``"LLM"``.
    """
    logger.info("Marking %s as %s by LLM", self, status)
    # Same assignment order as three separate statements: reviewer, status,
    # detection_method.
    self.reviewer, self.status, self.detection_method = None, status, "LLM"
    if detection_details:
        self.detection_details = detection_details
    self.save()
181+
169182
def mark_not_spam(self, reviewer: User, detection_details=None):
170183
logger.info("user %s marking %s as not spam", reviewer, self)
171184
self.status = self.Status.NOT_SPAM
@@ -182,7 +195,7 @@ def update_related_object(self):
182195
related_object = self.content_object
183196
if hasattr(related_object, "is_marked_spam"):
184197
related_object.spam_moderation = self
185-
related_object.is_marked_spam = self.status != self.Status.NOT_SPAM
198+
related_object.is_marked_spam = self.status == self.Status.SPAM
186199
related_object.save()
187200

188201
def __str__(self):

django/core/settings/defaults.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,8 @@ def is_test(self):
527527
DISCOURSE_API_KEY = read_secret("discourse_api_key", "unconfigured")
528528
DISCOURSE_API_USERNAME = os.getenv("DISCOURSE_API_USERNAME", "unconfigured")
529529

530+
LLM_SPAM_CHECK_API_KEY = read_secret("llm_spam_check_api_key", "unconfigured")
531+
530532
# https://docs.djangoproject.com/en/4.2/ref/settings/#templates
531533
TEMPLATES = [
532534
{

django/curator/auth.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from rest_framework.authentication import BaseAuthentication
2+
from rest_framework.exceptions import AuthenticationFailed, NotAuthenticated
3+
from django.conf import settings
4+
from rest_framework.permissions import BasePermission
5+
6+
7+
class APIKeyAuthentication(BaseAuthentication):
    """DRF authentication backend that checks a shared secret in ``X-API-Key``.

    The submitted key is compared with ``settings.LLM_SPAM_CHECK_API_KEY``.
    A successful check authenticates the request without a user:
    ``authenticate`` returns ``(None, None)``.
    """

    # Fallback value the settings module assigns to LLM_SPAM_CHECK_API_KEY
    # when no secret has been provisioned. It must never work as a credential.
    UNCONFIGURED_KEY = "unconfigured"

    def authenticate_header(self, request):
        """Return the ``WWW-Authenticate`` value sent with 401 responses."""
        return "X-API-Key"

    def authenticate(self, request):
        """Validate the ``X-API-Key`` request header.

        Returns:
            ``(None, None)`` when the key matches the configured secret.

        Raises:
            AuthenticationFailed: if the header is missing or empty, if no
                real secret is configured on the server, or if the key does
                not match.
        """
        # Local import keeps this block self-contained.
        import hmac

        api_key = request.META.get("HTTP_X_API_KEY")
        if not api_key:
            raise AuthenticationFailed("No API key")

        expected_key = settings.LLM_SPAM_CHECK_API_KEY
        # Fail closed: without a provisioned secret the setting holds a
        # publicly known placeholder that anyone could send.
        if not expected_key or expected_key == self.UNCONFIGURED_KEY:
            raise AuthenticationFailed("Invalid API key")

        # Constant-time comparison avoids leaking the secret via timing.
        # Encoding to bytes lets compare_digest accept non-ASCII header values.
        if not hmac.compare_digest(
            api_key.encode("utf-8"), expected_key.encode("utf-8")
        ):
            raise AuthenticationFailed("Invalid API key")

        return (None, None)

django/curator/serializers.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from rest_framework import serializers
2+
from core.models import SpamModeration
3+
from django.contrib.contenttypes.models import ContentType
4+
from rest_framework import serializers
5+
from core.models import Event, Job, SpamModeration
6+
from django.contrib.contenttypes.models import ContentType
7+
8+
from library.models import Codebase
9+
10+
11+
class SpamModerationSerializer(serializers.ModelSerializer):
    """Serialize a SpamModeration as ``id``, ``content_type`` and ``object_id``.

    ``content_type`` is exposed as the related content type's ``model`` string
    rather than its primary key.
    """

    content_type = serializers.CharField(source="content_type.model")

    class Meta:
        model = SpamModeration
        fields = ["id", "content_type", "object_id"]
27+
28+
29+
class MinimalJobSerializer(serializers.ModelSerializer):
    """Serialize a Job using only its id, text fields and external URL."""

    class Meta:
        model = Job
        fields = ["id", "title", "summary", "description", "external_url"]
39+
40+
41+
class MinimalEventSerializer(serializers.ModelSerializer):
    """Serialize an Event using only its id, text fields and external URL."""

    class Meta:
        model = Event
        fields = ["id", "title", "summary", "description", "external_url"]
51+
52+
53+
class MinimalCodebaseSerializer(serializers.ModelSerializer):
    """Serialize a Codebase using only its id, title and description."""

    class Meta:
        model = Codebase
        fields = ["id", "title", "description"]
61+
62+
63+
class SpamUpdateSerializer(serializers.Serializer):
    """Validate a single spam verdict payload.

    ``object_id`` and ``is_spam`` are declared without ``required=False``;
    ``spam_indicators`` (a list of strings), ``reasoning`` and ``confidence``
    are optional.
    """

    object_id = serializers.IntegerField()
    is_spam = serializers.BooleanField()
    spam_indicators = serializers.ListField(
        required=False,
        child=serializers.CharField(),
    )
    reasoning = serializers.CharField(required=False)
    confidence = serializers.FloatField(required=False)

0 commit comments

Comments
 (0)