Skip to content

Commit c8cf8f3

Browse files
bug CORE-4225: mongodb url bug (#2662)
The MongoDB `redact` method was originally written because we wanted part of the URL to remain visible to the user in log output, so it did not use the dataclass `enhanced_field(sensitive=True)` solution. This change switches MongoDB to our standard redaction solution, which also minimizes the amount of work that needs to be done in platform.
1 parent 9ae838e commit c8cf8f3

File tree

7 files changed

+15
-83
lines changed

7 files changed

+15
-83
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.13.0-dev12
1+
## 0.13.0-dev13
22

33
### Enhancements
44

@@ -20,6 +20,7 @@
2020
* **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint
2121
* **Adds tracking for AstraDB** Adds tracking info so AstraDB can see what source called their api.
2222
* **Support AWS Bedrock Embeddings in ingest CLI** The configs required to instantiate the bedrock embedding class are now exposed in the api and the version of boto being used meets the minimum requirement to introduce the bedrock runtime required to hit the service.
23+
* **Change MongoDB redacting** The original secret-redaction solution is causing issues in platform. This fix uses our standard logging redaction solution instead.
2324

2425
## 0.12.6
2526

test_unstructured_ingest/dest/vectara.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
1414
CORPUS_NAME="test-corpus-vectara-"$RANDOM_SUFFIX
1515

1616
# Expected size of the uploaded document
17-
EXPECTED_CORPUS_SIZE=8830076
17+
EXPECTED_CORPUS_SIZE=8842684
1818

1919
if [ -z "$VECTARA_OAUTH_CLIENT_ID" ] && [ -z "$VECTARA_OAUTH_SECRET" ] && [ -z "$VECTARA_CUSTOMER_ID" ]; then
2020
echo "Skipping VECTARA ingest test because VECTARA_OAUTH_CLIENT_ID, VECTARA_OAUTH_SECRET, or VECTARA_CUSTOMER_ID env var is not set."
@@ -89,5 +89,6 @@ if [ "$corpus_size" == "$EXPECTED_CORPUS_SIZE" ]; then
8989
echo "Corpus size is as expected: $corpus_size"
9090
else
9191
echo "Corpus size is not as expected: $corpus_size"
92+
echo "vs $EXPECTED_CORPUS_SIZE"
9293
exit 1
9394
fi

test_unstructured_ingest/unit/test_common.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.13.0-dev12" # pragma: no cover
1+
__version__ = "0.13.0-dev13" # pragma: no cover

unstructured/ingest/cli/common.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,7 @@
11
import logging
22

3-
from unstructured.ingest.logger import ingest_log_streaming_init, logger
4-
5-
6-
def options_redactions(options: dict) -> dict:
7-
# handle any logic needed to redact not already caught by the logging filter
8-
options = options.copy()
9-
if "uri" in options and options["uri"].startswith("mongodb"):
10-
from unstructured.ingest.connector.mongodb import redact
11-
12-
options["uri"] = redact(options["uri"])
13-
return options
3+
from unstructured.ingest.logger import ingest_log_streaming_init
144

155

166
def log_options(options: dict, verbose=False):
177
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
18-
logger.debug(f"options: {options_redactions(options)}")

unstructured/ingest/connector/mongodb.py

Lines changed: 8 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from dataclasses import dataclass, field
44
from pathlib import Path
55

6-
from dataclasses_json.core import Json
7-
86
from unstructured.__version__ import __version__ as unstructured_version
7+
from unstructured.ingest.enhanced_dataclass import enhanced_field
98
from unstructured.ingest.enhanced_dataclass.core import _asdict
109
from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError, WriteError
1110
from unstructured.ingest.interfaces import (
11+
AccessConfig,
1212
BaseConnectorConfig,
1313
BaseDestinationConnector,
1414
BaseIngestDocBatch,
@@ -34,72 +34,29 @@ def parse_userinfo(userinfo: str) -> t.Tuple[str, str]:
3434
return user, passwd
3535

3636

37-
def redact(uri: str, redacted_text="***REDACTED***") -> str:
38-
"""
39-
Cherry pick code from pymongo.uri_parser.parse_uri to only extract password and
40-
redact without needing to import pymongo library
41-
"""
42-
43-
SCHEME = "mongodb://"
44-
SRV_SCHEME = "mongodb+srv://"
45-
if uri.startswith(SCHEME):
46-
scheme_free = uri[len(SCHEME) :] # noqa: E203
47-
elif uri.startswith(SRV_SCHEME):
48-
scheme_free = uri[len(SRV_SCHEME) :] # noqa: E203
49-
else:
50-
raise ValueError(f"Invalid URI scheme: URI must begin with '{SCHEME}' or '{SRV_SCHEME}'")
51-
52-
passwd = None
53-
54-
host_part, _, path_part = scheme_free.partition("/")
55-
if not host_part:
56-
host_part = path_part
57-
path_part = ""
58-
59-
if not path_part:
60-
# There was no slash in scheme_free, check for a sole "?".
61-
host_part, _, _ = host_part.partition("?")
62-
63-
if "@" in host_part:
64-
userinfo, _, hosts = host_part.rpartition("@")
65-
_, passwd = parse_userinfo(userinfo)
66-
67-
if passwd:
68-
uri = uri.replace(passwd, redacted_text)
69-
return uri
37+
@dataclass
38+
class MongoDBAccessConfig(AccessConfig):
39+
uri: t.Optional[str] = enhanced_field(sensitive=True, default=None)
7040

7141

7242
@dataclass
7343
class SimpleMongoDBConfig(BaseConnectorConfig):
74-
uri: t.Optional[str] = None
44+
access_config: MongoDBAccessConfig
7545
host: t.Optional[str] = None
7646
database: t.Optional[str] = None
7747
collection: t.Optional[str] = None
7848
port: int = 27017
7949
batch_size: int = 100
8050

81-
def to_dict(
82-
self, redact_sensitive=False, redacted_text="***REDACTED***", **kwargs
83-
) -> t.Dict[str, Json]:
84-
d = super().to_dict(
85-
redact_sensitive=redact_sensitive, redacted_text=redacted_text, **kwargs
86-
)
87-
if redact_sensitive:
88-
if self.host:
89-
d["host"] = redact(uri=self.host, redacted_text=redacted_text)
90-
if self.uri:
91-
d["uri"] = redact(uri=self.uri, redacted_text=redacted_text)
92-
return d
93-
9451
@requires_dependencies(["pymongo"], extras="mongodb")
9552
def generate_client(self) -> "MongoClient":
9653
from pymongo import MongoClient
9754
from pymongo.driver_info import DriverInfo
9855
from pymongo.server_api import ServerApi
9956

100-
if self.uri:
57+
if self.access_config.uri:
10158
return MongoClient(
102-
self.uri,
59+
self.access_config.uri,
10360
server_api=ServerApi(version=SERVER_API_VERSION),
10461
driver=DriverInfo(name="unstructured", version=unstructured_version),
10562
)

unstructured/ingest/runner/mongodb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class MongoDBRunner(Runner):
1717

1818
def update_read_config(self):
1919
hashed_dir_name = hashlib.sha256(
20-
str(self.connector_config.uri).encode("utf-8"),
20+
str(self.connector_config.access_config.uri).encode("utf-8"),
2121
)
2222
self.read_config.download_dir = update_download_dir_hash(
2323
connector_name="mongodb",

0 commit comments

Comments
 (0)