Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 7e8083e

Browse files
committed
Add check_media_file_for_spam spam checker hook
1 parent afa18f1 commit 7e8083e

File tree

6 files changed

+210
-6
lines changed

6 files changed

+210
-6
lines changed

changelog.d/9311.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add a hook to spam checker modules that allows checking file uploads and remote downloads.

docs/spam_checker.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ class ExampleSpamChecker:
6161

6262
async def check_registration_for_spam(self, email_threepid, username, request_info):
6363
return RegistrationBehaviour.ALLOW # allow all registrations
64+
65+
async def check_media_file_for_spam(self, file_wrapper, file_info):
    # Returning True would block the media; returning False lets it through.
    return False  # allow all media
6467
```
6568

6669
## Configuration

synapse/events/spamcheck.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import inspect
1818
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
1919

20+
from synapse.rest.media.v1._base import FileInfo
21+
from synapse.rest.media.v1.media_storage import ReadableFileWrapper
2022
from synapse.spam_checker_api import RegistrationBehaviour
2123
from synapse.types import Collection
2224
from synapse.util.async_helpers import maybe_awaitable
@@ -214,3 +216,48 @@ async def check_registration_for_spam(
214216
return behaviour
215217

216218
return RegistrationBehaviour.ALLOW
219+
220+
async def check_media_file_for_spam(
    self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
) -> bool:
    """Asks each spam checker module whether a newly stored piece of media
    should be blocked.

    This will be called for local uploads, downloads of remote media, each
    thumbnail generated for those, and web pages/images used for URL
    previews.

    Modules should avoid blocking IO operations in the main thread. For
    example, to get the contents of a file a module should do::

        async def check_media_file_for_spam(
            self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
        ) -> bool:
            buffer = BytesIO()
            await file_wrapper.write_chunks_to(buffer.write)

            if buffer.getvalue() == b"Hello World":
                return True

            return False

    Args:
        file_wrapper: An object that allows reading the contents of the
            media.
        file_info: Metadata about the file.

    Returns:
        True if any module wants the media blocked, False if it should be
        allowed.
    """

    for module in self.spam_checkers:
        # Modules written before this hook existed won't define it; those
        # are skipped for backwards compatibility.
        hook = getattr(module, "check_media_file_for_spam", None)
        if not hook:
            continue

        # The hook may be sync or async, so normalise before awaiting. The
        # first module to flag the media wins.
        if await maybe_awaitable(hook(file_wrapper, file_info)):
            return True

    return False

synapse/rest/media/v1/media_storage.py

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,17 @@
1616
import logging
1717
import os
1818
import shutil
19-
from typing import IO, TYPE_CHECKING, Any, Optional, Sequence
19+
from typing import IO, TYPE_CHECKING, Any, Callable, Optional, Sequence
20+
21+
import attr
2022

2123
from twisted.internet.defer import Deferred
2224
from twisted.internet.interfaces import IConsumer
2325
from twisted.protocols.basic import FileSender
2426

27+
from synapse.api.errors import NotFoundError
2528
from synapse.logging.context import defer_to_thread, make_deferred_yieldable
29+
from synapse.util import Clock
2630
from synapse.util.file_consumer import BackgroundFileConsumer
2731

2832
from ._base import FileInfo, Responder
@@ -58,6 +62,8 @@ def __init__(
5862
self.local_media_directory = local_media_directory
5963
self.filepaths = filepaths
6064
self.storage_providers = storage_providers
65+
self.spam_checker = hs.get_spam_checker()
66+
self.clock = hs.get_clock()
6167

6268
async def store_file(self, source: IO, file_info: FileInfo) -> str:
6369
"""Write `source` to the on disk media store, and also any other
@@ -127,18 +133,29 @@ async def finish():
127133
f.flush()
128134
f.close()
129135

136+
spam = await self.spam_checker.check_media_file_for_spam(
137+
ReadableFileWrapper(self.clock, fname), file_info
138+
)
139+
if spam:
140+
logger.info("Blocking media due to spam checker")
141+
# Note that we'll delete the stored media, due to the
142+
# try/except below. The media also won't be stored in
143+
# the DB.
144+
raise SpamMediaException()
145+
130146
for provider in self.storage_providers:
131147
await provider.store_file(path, file_info)
132148

133149
finished_called[0] = True
134150

135151
yield f, fname, finish
136-
except Exception:
152+
except Exception as e:
137153
try:
138154
os.remove(fname)
139155
except Exception:
140156
pass
141-
raise
157+
158+
raise e from None
142159

143160
if not finished_called:
144161
raise Exception("Finished callback not called")
@@ -302,3 +319,39 @@ def write_to_consumer(self, consumer: IConsumer) -> Deferred:
302319

303320
def __exit__(self, exc_type, exc_val, exc_tb):
304321
self.open_file.close()
322+
323+
324+
class SpamMediaException(NotFoundError):
    """Raised when a spam checker has blocked a piece of media.

    This subclasses `NotFoundError`, so by default we simply 404 the request
    (in the same way as if the media was quarantined).
    """
328+
329+
330+
@attr.s(slots=True)
class ReadableFileWrapper:
    """Reads a file from disk chunk by chunk, passing each chunk to a callback
    and yielding to the reactor in between.

    This is a simplified `FileSender` that takes an IO object rather than an
    `IConsumer`.
    """

    # Number of bytes requested from the file per read.
    CHUNK_SIZE = 2 ** 14

    clock = attr.ib(type=Clock)
    path = attr.ib(type=str)

    async def write_chunks_to(self, callback: Callable[[bytes], None]):
        """Opens the file at `self.path` and feeds it to `callback` one chunk
        at a time.

        Args:
            callback: Called with each chunk of bytes read, in file order.
        """

        with open(self.path, "rb") as source:
            # The two-argument form of `iter` keeps reading until `read`
            # returns the empty bytes object, i.e. end of file.
            for data in iter(lambda: source.read(self.CHUNK_SIZE), b""):
                callback(data)

                # Sleeping for 0 seconds yields to the reactor between chunks.
                await self.clock.sleep(0)

synapse/rest/media/v1/upload_resource.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from synapse.api.errors import Codes, SynapseError
2323
from synapse.http.server import DirectServeJsonResource, respond_with_json
2424
from synapse.http.servlet import parse_string
25+
from synapse.rest.media.v1.media_storage import SpamMediaException
2526

2627
if TYPE_CHECKING:
2728
from synapse.app.homeserver import HomeServer
@@ -86,9 +87,14 @@ async def _async_render_POST(self, request: Request) -> None:
8687
# disposition = headers.getRawHeaders(b"Content-Disposition")[0]
8788
# TODO(markjh): parse content-disposition
8889

89-
content_uri = await self.media_repo.create_content(
90-
media_type, upload_name, request.content, content_length, requester.user
91-
)
90+
try:
91+
content_uri = await self.media_repo.create_content(
92+
media_type, upload_name, request.content, content_length, requester.user
93+
)
94+
except SpamMediaException:
95+
# For uploading of media we want to respond with a 400, instead of
96+
# the default 404, as that would just be confusing.
97+
raise SynapseError(400, "Bad content")
9298

9399
logger.info("Uploaded content with URI %r", content_uri)
94100

tests/rest/media/v1/test_media_storage.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,16 @@
3030
from twisted.internet.defer import Deferred
3131

3232
from synapse.logging.context import make_deferred_yieldable
33+
from synapse.rest import admin
34+
from synapse.rest.client.v1 import login
3335
from synapse.rest.media.v1._base import FileInfo
3436
from synapse.rest.media.v1.filepath import MediaFilePaths
3537
from synapse.rest.media.v1.media_storage import MediaStorage
3638
from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend
3739

3840
from tests import unittest
3941
from tests.server import FakeSite, make_request
42+
from tests.utils import default_config
4043

4144

4245
class MediaStorageTests(unittest.HomeserverTestCase):
@@ -398,3 +401,94 @@ def test_x_robots_tag_header(self):
398401
headers.getRawHeaders(b"X-Robots-Tag"),
399402
[b"noindex, nofollow, noarchive, noimageindex"],
400403
)
404+
405+
406+
class TestSpamChecker:
    """A spam checker module that rejects all media that includes the bytes
    `evil`.

    Every other hook is permissive, so only the media check can cause a
    rejection.
    """

    def __init__(self, config, api):
        self.config = config
        self.api = api

    @staticmethod
    def parse_config(config):
        """Returns the module config unchanged.

        Declared as a static method since it takes no `self`; without the
        decorator it could only be called on the class, never on an instance.
        """
        return config

    async def check_event_for_spam(self, foo):
        return False  # allow all events

    async def user_may_invite(self, inviter_userid, invitee_userid, room_id):
        return True  # allow all invites

    async def user_may_create_room(self, userid):
        return True  # allow all room creations

    async def user_may_create_room_alias(self, userid, room_alias):
        return True  # allow all room aliases

    async def user_may_publish_room(self, userid, room_id):
        return True  # allow publishing of all rooms

    async def check_media_file_for_spam(self, file_wrapper, file_info) -> bool:
        """Reads the whole file into memory and flags it if it contains the
        bytes `evil`.

        Args:
            file_wrapper: Object whose `write_chunks_to` feeds the file
                contents to a callback.
            file_info: Metadata about the file (unused).

        Returns:
            True if the media should be blocked, False otherwise.
        """
        buf = BytesIO()
        await file_wrapper.write_chunks_to(buf.write)

        return b"evil" in buf.getvalue()
438+
439+
440+
class SpamCheckerTestCase(unittest.HomeserverTestCase):
    """Uploads media to a homeserver configured with `TestSpamChecker` and
    checks the resulting response codes.
    """

    servlets = [
        login.register_servlets,
        admin.register_servlets,
    ]

    def prepare(self, reactor, clock, hs):
        self.user = self.register_user("user", "pass")
        self.tok = self.login("user", "pass")

        # Allow for uploading and downloading to/from the media repo
        self.media_repo = hs.get_media_repository_resource()
        repo_children = self.media_repo.children
        self.download_resource = repo_children[b"download"]
        self.upload_resource = repo_children[b"upload"]

    def default_config(self):
        """Returns the default test config with `TestSpamChecker` registered
        as the only spam checker module.
        """
        # Inside a method, `default_config` resolves to the module-level
        # helper, not to this method.
        config = default_config("test")

        spam_checker_module = {
            "module": TestSpamChecker.__module__ + ".TestSpamChecker",
            "config": {},
        }
        config["spam_checker"] = [spam_checker_module]

        return config

    def test_upload_innocent(self):
        """Upload some innocent data, which the spam checker should allow.
        """

        image_data = unhexlify(
            b"89504e470d0a1a0a0000000d4948445200000001000000010806"
            b"0000001f15c4890000000a49444154789c63000100000500010d"
            b"0a2db40000000049454e44ae426082"
        )

        self.helper.upload_media(
            self.upload_resource, image_data, tok=self.tok, expect_code=200
        )

    def test_upload_ban(self):
        """Upload data containing the bytes "evil", which the spam checker
        should reject.
        """

        payload = b"Some evil data"

        self.helper.upload_media(
            self.upload_resource, payload, tok=self.tok, expect_code=400
        )

0 commit comments

Comments
 (0)