Skip to content

Commit e271315

Browse files
authored
PYTHON-2998 Remove md5 checksums from gridfs and remove disable_md5 (#776)
Speed up gridfs tests (shaves off about 2 minutes on macOS).
1 parent 89f41cf commit e271315

12 files changed

+94
-129
lines changed

doc/changelog.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,12 +170,15 @@ Breaking Changes in 4.0
170170
are passed to the server as-is rather than the previous behavior which
171171
substituted in a projection of ``{"_id": 1}``. This means that an empty
172172
projection will now return the entire document, not just the ``"_id"`` field.
173-
- :class:`~pymongo.mongo_client.MongoClient` now raises a :exc:`~pymongo.errors.ConfigurationError`
174-
when more than one URI is passed into the ``hosts`` argument.
173+
- :class:`~pymongo.mongo_client.MongoClient` now raises a
174+
:exc:`~pymongo.errors.ConfigurationError` when more than one URI is passed
175+
into the ``hosts`` argument.
175176
- :class:`~pymongo.mongo_client.MongoClient` now raises an
176177
:exc:`~pymongo.errors.InvalidURI` exception
177178
when it encounters unescaped percent signs in username and password when
178179
parsing MongoDB URIs.
180+
- Removed the ``disable_md5`` parameter for :class:`~gridfs.GridFSBucket` and
181+
:class:`~gridfs.GridFS`. See :ref:`removed-gridfs-checksum` for details.
179182

180183
Notable improvements
181184
....................

doc/migrate-to-pymongo4.rst

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,32 @@ Changed the default JSON encoding representation from legacy to relaxed.
819819
The json_mode parameter for :const:`bson.json_util.dumps` now defaults to
820820
:const:`~bson.json_util.RELAXED_JSON_OPTIONS`.
821821

822+
GridFS changes
823+
--------------
824+
825+
.. _removed-gridfs-checksum:
826+
827+
disable_md5 parameter is removed
828+
................................
829+
830+
Removed the `disable_md5` option for :class:`~gridfs.GridFSBucket` and
831+
:class:`~gridfs.GridFS`. GridFS no longer generates checksums.
832+
Applications that desire a file digest should implement it outside GridFS
833+
and store it with other file metadata. For example::
834+
835+
import hashlib
836+
my_db = MongoClient().test
837+
fs = GridFSBucket(my_db)
838+
grid_in = fs.open_upload_stream("test_file")
839+
file_data = b'...'
840+
sha256 = hashlib.sha256(file_data).hexdigest()
841+
grid_in.write(file_data)
842+
grid_in.sha256 = sha256 # Set the custom 'sha256' field
843+
grid_in.close()
844+
845+
Note that for large files, the checksum may need to be computed in chunks
846+
to avoid the excessive memory needed to load the entire file at once.
847+
822848
Removed features with no migration path
823849
---------------------------------------
824850

gridfs/__init__.py

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
class GridFS(object):
4040
"""An instance of GridFS on top of a single Database.
4141
"""
42-
def __init__(self, database, collection="fs", disable_md5=False):
42+
def __init__(self, database, collection="fs"):
4343
"""Create a new instance of :class:`GridFS`.
4444
4545
Raises :class:`TypeError` if `database` is not an instance of
@@ -48,14 +48,18 @@ def __init__(self, database, collection="fs", disable_md5=False):
4848
:Parameters:
4949
- `database`: database to use
5050
- `collection` (optional): root collection to use
51-
- `disable_md5` (optional): When True, MD5 checksums will not be
52-
computed for uploaded files. Useful in environments where MD5
53-
cannot be used for regulatory or other reasons. Defaults to False.
51+
52+
.. versionchanged:: 4.0
53+
Removed the `disable_md5` parameter. See
54+
:ref:`removed-gridfs-checksum` for details.
5455
5556
.. versionchanged:: 3.11
5657
Running a GridFS operation in a transaction now always raises an
5758
error. GridFS does not support multi-document transactions.
5859
60+
.. versionchanged:: 3.7
61+
Added the `disable_md5` parameter.
62+
5963
.. versionchanged:: 3.1
6064
Indexes are only ensured on the first write to the DB.
6165
@@ -77,7 +81,6 @@ def __init__(self, database, collection="fs", disable_md5=False):
7781
self.__collection = database[collection]
7882
self.__files = self.__collection.files
7983
self.__chunks = self.__collection.chunks
80-
self.__disable_md5 = disable_md5
8184

8285
def new_file(self, **kwargs):
8386
"""Create a new file in GridFS.
@@ -93,8 +96,7 @@ def new_file(self, **kwargs):
9396
:Parameters:
9497
- `**kwargs` (optional): keyword arguments for file creation
9598
"""
96-
return GridIn(
97-
self.__collection, disable_md5=self.__disable_md5, **kwargs)
99+
return GridIn(self.__collection, **kwargs)
98100

99101
def put(self, data, **kwargs):
100102
"""Put data in GridFS as a new file.
@@ -126,8 +128,7 @@ def put(self, data, **kwargs):
126128
.. versionchanged:: 3.0
127129
w=0 writes to GridFS are now prohibited.
128130
"""
129-
grid_file = GridIn(
130-
self.__collection, disable_md5=self.__disable_md5, **kwargs)
131+
grid_file = GridIn(self.__collection, **kwargs)
131132
try:
132133
grid_file.write(data)
133134
finally:
@@ -423,7 +424,7 @@ class GridFSBucket(object):
423424

424425
def __init__(self, db, bucket_name="fs",
425426
chunk_size_bytes=DEFAULT_CHUNK_SIZE, write_concern=None,
426-
read_preference=None, disable_md5=False):
427+
read_preference=None):
427428
"""Create a new instance of :class:`GridFSBucket`.
428429
429430
Raises :exc:`TypeError` if `database` is not an instance of
@@ -442,13 +443,17 @@ def __init__(self, db, bucket_name="fs",
442443
(the default) db.write_concern is used.
443444
- `read_preference` (optional): The read preference to use. If
444445
``None`` (the default) db.read_preference is used.
445-
- `disable_md5` (optional): When True, MD5 checksums will not be
446-
computed for uploaded files. Useful in environments where MD5
447-
cannot be used for regulatory or other reasons. Defaults to False.
446+
447+
.. versionchanged:: 4.0
448+
Removed the `disable_md5` parameter. See
449+
:ref:`removed-gridfs-checksum` for details.
448450
449451
.. versionchanged:: 3.11
450-
Running a GridFS operation in a transaction now always raises an
451-
error. GridFSBucket does not support multi-document transactions.
452+
Running a GridFSBucket operation in a transaction now always raises
453+
an error. GridFSBucket does not support multi-document transactions.
454+
455+
.. versionchanged:: 3.7
456+
Added the `disable_md5` parameter.
452457
453458
.. versionadded:: 3.1
454459
@@ -465,8 +470,6 @@ def __init__(self, db, bucket_name="fs",
465470

466471
self._bucket_name = bucket_name
467472
self._collection = db[bucket_name]
468-
self._disable_md5 = disable_md5
469-
470473
self._chunks = self._collection.chunks.with_options(
471474
write_concern=write_concern,
472475
read_preference=read_preference)
@@ -522,11 +525,7 @@ def open_upload_stream(self, filename, chunk_size_bytes=None,
522525
if metadata is not None:
523526
opts["metadata"] = metadata
524527

525-
return GridIn(
526-
self._collection,
527-
session=session,
528-
disable_md5=self._disable_md5,
529-
**opts)
528+
return GridIn(self._collection, session=session, **opts)
530529

531530
def open_upload_stream_with_id(
532531
self, file_id, filename, chunk_size_bytes=None, metadata=None,
@@ -579,11 +578,7 @@ def open_upload_stream_with_id(
579578
if metadata is not None:
580579
opts["metadata"] = metadata
581580

582-
return GridIn(
583-
self._collection,
584-
session=session,
585-
disable_md5=self._disable_md5,
586-
**opts)
581+
return GridIn(self._collection, session=session, **opts)
587582

588583
def upload_from_stream(self, filename, source, chunk_size_bytes=None,
589584
metadata=None, session=None):

gridfs/grid_file.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
"""Tools for representing files stored in GridFS."""
1616
import datetime
17-
import hashlib
1817
import io
1918
import math
2019
import os
@@ -115,8 +114,7 @@ def _disallow_transactions(session):
115114
class GridIn(object):
116115
"""Class to write data to GridFS.
117116
"""
118-
def __init__(
119-
self, root_collection, session=None, disable_md5=False, **kwargs):
117+
def __init__(self, root_collection, session=None, **kwargs):
120118
"""Write a file to GridFS
121119
122120
Application developers should generally not need to
@@ -152,12 +150,15 @@ def __init__(
152150
- `session` (optional): a
153151
:class:`~pymongo.client_session.ClientSession` to use for all
154152
commands
155-
- `disable_md5` (optional): When True, an MD5 checksum will not be
156-
computed for the uploaded file. Useful in environments where
157-
MD5 cannot be used for regulatory or other reasons. Defaults to
158-
False.
159153
- `**kwargs` (optional): file level options (see above)
160154
155+
.. versionchanged:: 4.0
156+
Removed the `disable_md5` parameter. See
157+
:ref:`removed-gridfs-checksum` for details.
158+
159+
.. versionchanged:: 3.7
160+
Added the `disable_md5` parameter.
161+
161162
.. versionchanged:: 3.6
162163
Added ``session`` parameter.
163164
@@ -183,8 +184,6 @@ def __init__(
183184
coll = _clear_entity_type_registry(
184185
root_collection, read_preference=ReadPreference.PRIMARY)
185186

186-
if not disable_md5:
187-
kwargs["md5"] = hashlib.md5()
188187
# Defaults
189188
kwargs["_id"] = kwargs.get("_id", ObjectId())
190189
kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)
@@ -271,9 +270,6 @@ def __flush_data(self, data):
271270
"""Flush `data` to a chunk.
272271
"""
273272
self.__ensure_indexes()
274-
if 'md5' in self._file:
275-
self._file['md5'].update(data)
276-
277273
if not data:
278274
return
279275
assert(len(data) <= self.chunk_size)
@@ -301,9 +297,6 @@ def __flush(self):
301297
"""
302298
try:
303299
self.__flush_buffer()
304-
305-
if "md5" in self._file:
306-
self._file["md5"] = self._file["md5"].hexdigest()
307300
# The GridFS spec says length SHOULD be an Int64.
308301
self._file["length"] = Int64(self._position)
309302
self._file["uploadDate"] = datetime.datetime.utcnow()

test/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,13 @@ def setUpClass(cls):
949949
else:
950950
cls.credentials = {}
951951

952+
def cleanup_colls(self, *collections):
953+
"""Clean up collections faster than drop_collection."""
954+
for c in collections:
955+
c = self.client[c.database.name][c.name]
956+
c.delete_many({})
957+
c.drop_indexes()
958+
952959
def patch_system_certs(self, ca_certs):
953960
patcher = SystemCertsPatcher(ca_certs)
954961
self.addCleanup(patcher.disable)

test/gridfs/upload.json

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
"length": 0,
3030
"chunkSize": 4,
3131
"uploadDate": "*actual",
32-
"md5": "d41d8cd98f00b204e9800998ecf8427e",
3332
"filename": "filename"
3433
}
3534
]
@@ -62,7 +61,6 @@
6261
"length": 1,
6362
"chunkSize": 4,
6463
"uploadDate": "*actual",
65-
"md5": "47ed733b8d10be225eceba344d533586",
6664
"filename": "filename"
6765
}
6866
]
@@ -108,7 +106,6 @@
108106
"length": 3,
109107
"chunkSize": 4,
110108
"uploadDate": "*actual",
111-
"md5": "bafae3a174ab91fc70db7a6aa50f4f52",
112109
"filename": "filename"
113110
}
114111
]
@@ -154,7 +151,6 @@
154151
"length": 4,
155152
"chunkSize": 4,
156153
"uploadDate": "*actual",
157-
"md5": "7e7c77cff5705d1f7574a25ef6662117",
158154
"filename": "filename"
159155
}
160156
]
@@ -200,7 +196,6 @@
200196
"length": 5,
201197
"chunkSize": 4,
202198
"uploadDate": "*actual",
203-
"md5": "283d4fea5dded59cf837d3047328f5af",
204199
"filename": "filename"
205200
}
206201
]
@@ -254,7 +249,6 @@
254249
"length": 8,
255250
"chunkSize": 4,
256251
"uploadDate": "*actual",
257-
"md5": "dd254cdc958e53abaa67da9f797125f5",
258252
"filename": "filename"
259253
}
260254
]
@@ -309,7 +303,6 @@
309303
"length": 1,
310304
"chunkSize": 4,
311305
"uploadDate": "*actual",
312-
"md5": "47ed733b8d10be225eceba344d533586",
313306
"filename": "filename",
314307
"contentType": "image/jpeg"
315308
}
@@ -359,7 +352,6 @@
359352
"length": 1,
360353
"chunkSize": 4,
361354
"uploadDate": "*actual",
362-
"md5": "47ed733b8d10be225eceba344d533586",
363355
"filename": "filename",
364356
"metadata": {
365357
"x": 1

test/test_custom_types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,7 @@ def test_grid_out_custom_opts(self):
672672
self.assertEqual(["foo"], two.aliases)
673673
self.assertEqual({"foo": 'red', "bar": 'blue'}, two.metadata)
674674
self.assertEqual(3, two.bar)
675-
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5)
675+
self.assertEqual(None, two.md5)
676676

677677
for attr in ["_id", "name", "content_type", "length", "chunk_size",
678678
"upload_date", "aliases", "metadata", "md5"]:

test/test_grid_file.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,7 @@ def test_grid_in_custom_opts(self):
8080
class TestGridFile(IntegrationTest):
8181

8282
def setUp(self):
83-
self.db.drop_collection('fs.files')
84-
self.db.drop_collection('fs.chunks')
83+
self.cleanup_colls(self.db.fs.files, self.db.fs.chunks)
8584

8685
def test_basic(self):
8786
f = GridIn(self.db.fs, filename="test")
@@ -112,7 +111,7 @@ def test_md5(self):
112111
f = GridIn(self.db.fs)
113112
f.write(b"hello world\n")
114113
f.close()
115-
self.assertEqual("6f5902ac237024bdd0c176cb93063dc4", f.md5)
114+
self.assertEqual(None, f.md5)
116115

117116
def test_alternate_collection(self):
118117
self.db.alt.files.delete_many({})
@@ -128,9 +127,6 @@ def test_alternate_collection(self):
128127
g = GridOut(self.db.alt, f._id)
129128
self.assertEqual(b"hello world", g.read())
130129

131-
# test that md5 still works...
132-
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", g.md5)
133-
134130
def test_grid_in_default_opts(self):
135131
self.assertRaises(TypeError, GridIn, "foo")
136132

@@ -194,7 +190,7 @@ def test_grid_in_default_opts(self):
194190

195191
self.assertEqual({"foo": 1}, a.metadata)
196192

197-
self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", a.md5)
193+
self.assertEqual(None, a.md5)
198194
self.assertRaises(AttributeError, setattr, a, "md5", 5)
199195

200196
# Make sure custom attributes that were set both before and after
@@ -225,7 +221,7 @@ def test_grid_out_default_opts(self):
225221
self.assertTrue(isinstance(b.upload_date, datetime.datetime))
226222
self.assertEqual(None, b.aliases)
227223
self.assertEqual(None, b.metadata)
228-
self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", b.md5)
224+
self.assertEqual(None, b.md5)
229225

230226
for attr in ["_id", "name", "content_type", "length", "chunk_size",
231227
"upload_date", "aliases", "metadata", "md5"]:
@@ -266,7 +262,7 @@ def test_grid_out_custom_opts(self):
266262
self.assertEqual(["foo"], two.aliases)
267263
self.assertEqual({"foo": 1, "bar": 2}, two.metadata)
268264
self.assertEqual(3, two.bar)
269-
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5)
265+
self.assertEqual(None, two.md5)
270266

271267
for attr in ["_id", "name", "content_type", "length", "chunk_size",
272268
"upload_date", "aliases", "metadata", "md5"]:

0 commit comments

Comments
 (0)