Commit 231d50c

Merge branch 'feature/PI-617-split_ldif_by_party_key' into release/2024-11-13

2 parents: 33c0dd8 + 14fa4e5

File tree: 2 files changed (+152 / -13 lines)

src/layers/etl_utils/ldif/ldif.py
Lines changed: 84 additions & 13 deletions
@@ -1,7 +1,8 @@
 import re
+from collections import defaultdict
 from io import BytesIO
 from types import FunctionType
-from typing import IO, TYPE_CHECKING, Callable, Generator
+from typing import IO, TYPE_CHECKING, Callable, Generator, Protocol

 from etl_utils.ldif.model import DistinguishedName
 from smart_open import open as _smart_open
@@ -67,6 +68,13 @@ def ldif_dump(fp: IO, obj: list[PARSED_RECORD]) -> str:
     )


+class _StreamBlock(Protocol):
+    def flush(self) -> str: ...
+    def reset(self): ...
+    def parse(self, line: bytes): ...
+    def __bool__(self): ...
+
+
 class StreamBlock:
     def __init__(self, filter_terms: list[tuple[str, str]]):
         self.data = BytesIO()
@@ -80,7 +88,7 @@ def flush(self) -> str:
         self.data.write(self.buffer)
         self.reset()

-    def reset(self) -> str:
+    def reset(self):
         self.buffer = bytes()
         self.keep = False

@@ -93,20 +101,41 @@ def __bool__(self):
         return bool(self.buffer) and self.keep


-def filter_ldif_from_s3_by_property(
-    s3_path, filter_terms: list[tuple[str, str]], s3_client: "S3Client"
-) -> memoryview:
-    """
-    Efficiently streams a file from S3 directly into a bytes memoryview,
-    filtering out any LDIF record without any (attribute_name, attribute_value)
-    matching at least one of the filter terms.
+class GroupedStreamBlock:
+    def __init__(self, group_field: str, filter_terms: list[tuple[str, str]]):
+        self.data = defaultdict(BytesIO)
+        self.filters: list[FunctionType] = [
+            re.compile(rf"(?i)^({key}): ({value})\n$".encode()).match
+            for key, value in filter_terms
+        ]
+        self.group_filter = re.compile(rf"(?i)^({group_field}): (.*)\n$".encode()).match
+        self.reset()

-    The output of this function can then be parsed using
-    'parse_ldif(file_opener=BytesIO, path_or_data=filtered_ldif)'
-    """
+    def flush(self) -> str:
+        if self.group is None:
+            raise Exception
+        self.data[self.group].write(self.buffer)
+        self.reset()

-    stream_block = StreamBlock(filter_terms)
+    def reset(self):
+        self.buffer = bytes()
+        self.keep = False
+        self.group = None
+
+    def parse(self, line: bytes):
+        group_match = self.group_filter(line)
+        if group_match:
+            (_, self.group) = group_match.groups()
+
+        if not self.keep and any(filter(line) for filter in self.filters):
+            self.keep = True
+        self.buffer += line

+    def __bool__(self):
+        return bool(self.buffer) and self.keep
+
+
+def stream_to_block(s3_path: str, s3_client: "S3Client", stream_block: _StreamBlock):
     with _smart_open(s3_path, mode="rb", transport_params={"client": s3_client}) as f:
         for line in f.readlines():
             line_is_empty = line.strip() == EMPTY_BYTESTRING
@@ -118,4 +147,46 @@

     if stream_block:
         stream_block.flush()
+
+
+def filter_ldif_from_s3_by_property(
+    s3_path, filter_terms: list[tuple[str, str]], s3_client: "S3Client"
+) -> memoryview:
+    """
+    Efficiently streams a file from S3 directly into a bytes memoryview,
+    filtering out any LDIF record without any (attribute_name, attribute_value)
+    matching at least one of the filter terms.
+
+    The output of this function can then be parsed using
+    'parse_ldif(file_opener=BytesIO, path_or_data=filtered_ldif)'
+    """
+    stream_block = StreamBlock(filter_terms)
+    stream_to_block(s3_path=s3_path, s3_client=s3_client, stream_block=stream_block)
     return stream_block.data.getbuffer()
+
+
+def filter_and_group_ldif_from_s3_by_property(
+    s3_path,
+    group_field: str,
+    filter_terms: list[tuple[str, str]],
+    s3_client: "S3Client",
+) -> memoryview:
+    """
+    Efficiently streams a file from S3 directly into a bytes memoryview,
+    filtering out any LDIF record without any (attribute_name, attribute_value)
+    matching at least one of the filter terms, and then also grouping records
+    by the group_field.
+
+    The output of this function can then be parsed using
+    'parse_ldif(file_opener=BytesIO, path_or_data=filtered_and_grouped_ldif)'
+    """
+
+    stream_block = GroupedStreamBlock(
+        group_field=group_field, filter_terms=filter_terms
+    )
+    stream_to_block(s3_path=s3_path, s3_client=s3_client, stream_block=stream_block)
+
+    data = BytesIO()
+    for group in stream_block.data.values():
+        data.write(group.getbuffer())
+    return data.getbuffer()
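
Note on the design: _StreamBlock is a typing.Protocol, so stream_to_block accepts StreamBlock and GroupedStreamBlock interchangeably through structural typing rather than inheritance. For orientation, a minimal sketch of how the new grouped filter might be called end to end; the bucket, key, group field, and filter term below are hypothetical placeholders taken from the test fixture, and only the signatures and the parse_ldif follow-up come from this diff:

    import boto3
    from io import BytesIO

    from etl_utils.ldif.ldif import (
        filter_and_group_ldif_from_s3_by_property,
        parse_ldif,
    )

    s3_client = boto3.client("s3")

    # Hypothetical S3 object and filter term: keep only records carrying
    # myOtherField: 123, with records sharing a myField value written
    # contiguously into the output buffer.
    grouped = filter_and_group_ldif_from_s3_by_property(
        s3_path="s3://my-bucket/extract.ldif",
        group_field="myField",
        filter_terms=[("myOtherField", "123")],
        s3_client=s3_client,
    )

    # As the docstring suggests, the returned memoryview can then be
    # parsed back into records:
    records = parse_ldif(file_opener=BytesIO, path_or_data=grouped)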

src/layers/etl_utils/ldif/tests/test_ldif.py
Lines changed: 68 additions & 0 deletions
@@ -5,6 +5,7 @@
 import pytest
 from etl_utils.ldif.ldif import (
     DistinguishedName,
+    filter_and_group_ldif_from_s3_by_property,
     filter_ldif_from_s3_by_property,
     ldif_dump,
     parse_ldif,
@@ -247,6 +248,54 @@
     },
 )

+LDIF_TO_FILTER_AND_GROUP_EXAMPLE = """
+dn: uniqueIdentifier=AAA1
+myField: AAA
+myOtherField: 123
+
+dn: uniqueIdentifier=BBB1
+myfield: BBB
+myOtherField: 123
+
+dn: uniqueIdentifier=BBB2
+myfield: BBB
+myOtherField: 123
+
+dn: uniqueIdentifier=AAA2
+myfield: AAA
+myOtherField: 123
+
+dn: uniqueIdentifier=AAA3
+myField: AAA
+myOtherField: 234
+
+dn: uniqueIdentifier=BBB3
+myfield: BBB
+myOtherField: 123
+"""
+
+FILTERED_AND_GROUPED_LDIF_TO_FILTER_AND_GROUP_EXAMPLE = """
+dn: uniqueIdentifier=AAA1
+myField: AAA
+myOtherField: 123
+
+dn: uniqueIdentifier=AAA2
+myfield: AAA
+myOtherField: 123
+
+dn: uniqueIdentifier=BBB1
+myfield: BBB
+myOtherField: 123
+
+dn: uniqueIdentifier=BBB2
+myfield: BBB
+myOtherField: 123
+
+dn: uniqueIdentifier=BBB3
+myfield: BBB
+myOtherField: 123
+"""
+

 @pytest.mark.parametrize(
     ("raw_distinguished_name", "parsed_distinguished_name"),
@@ -322,6 +371,25 @@ def test_filter_ldif_from_s3_by_property(mocked_open):
     )


+@mock.patch(
+    "etl_utils.ldif.ldif._smart_open",
+    return_value=BytesIO(LDIF_TO_FILTER_AND_GROUP_EXAMPLE.encode()),
+)
+def test_filter_and_group_ldif_from_s3_by_property(mocked_open):
+    with mock_aws():
+        s3_client = boto3.client("s3")
+        filtered_ldif = filter_and_group_ldif_from_s3_by_property(
+            s3_client=s3_client,
+            s3_path="s3://dummy_bucket/dummy_key",
+            group_field="myField",
+            filter_terms=[("myOtherField", "123")],
+        )
+        assert (
+            filtered_ldif.tobytes().decode()
+            == FILTERED_AND_GROUPED_LDIF_TO_FILTER_AND_GROUP_EXAMPLE
+        )
+
+
 @pytest.mark.parametrize(
     ["raw_ldif", "parsed_ldif"],
     [
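
The mixed-case myField / myfield attribute names in the fixture above are deliberate: the (?i) flag on the patterns compiled in GroupedStreamBlock makes both the filter terms and the group field match case-insensitively, while the ^ ... \n$ anchors require a whole-line match. A small standalone sketch of that matching behaviour, built the same way GroupedStreamBlock builds its group_filter (the sample lines are invented):

    import re

    group_field = "myField"  # same construction as GroupedStreamBlock.group_filter
    group_filter = re.compile(rf"(?i)^({group_field}): (.*)\n$".encode()).match

    assert group_filter(b"myfield: BBB\n")            # (?i): name matches case-insensitively
    assert group_filter(b"myField: AAA\n").groups() == (b"myField", b"AAA")
    assert group_filter(b"myOtherField: 123\n") is None  # anchored: attribute name must match exactly
    assert group_filter(b"myField: AAA") is None         # the trailing newline is part of the pattern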
