Skip to content

Commit 8ce015b

Browse files
Pablu23 and mhoff authored
Allow custom http headers to be copied into logs (#912)
* First integration of config and actual usage, with more type hints * Fix ng tests * Add tests to see if custom headers work, and to see that they overwrite default headers * Correct collect_meta config, add warning when overwriting event Data, add tests * Update logprep/connector/http/input.py Co-authored-by: Michael Hoff <mail@michael-hoff.net> * Improve documentation, rename constant to all uppercase, add tests and fix tests, remove kwargs and batch_size input as it is not part of queue implementation * Update config to be set and not only type annotated * dirty hack to give low powered machines like gh actions more time * change list to set, change default to factory, remove typing and change to collections.abc and use lowercase built in types where possible * Update logprep/connector/http/input.py Co-authored-by: Michael Hoff <mail@michael-hoff.net> * Change test text aswell * Update changelog --------- Co-authored-by: Michael Hoff <mail@michael-hoff.net>
1 parent b8ab160 commit 8ce015b

File tree

7 files changed

+236
-38
lines changed

7 files changed

+236
-38
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
### Breaking
33

44
### Features
5+
* Headers from incoming http requests can now be copied into events via the `copy_headers_to_logs` config of the http input; `collect_meta` will be deprecated in the future
56

67
### Improvements
78

logprep/connector/http/input.py

Lines changed: 81 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@
9393
import zlib
9494
from abc import ABC
9595
from base64 import b64encode
96+
from collections.abc import Callable, Mapping
9697
from functools import cached_property
97-
from typing import Callable, List, Mapping, Tuple, Type, Union
9898

9999
import falcon.asgi
100100
import msgspec
@@ -172,24 +172,42 @@ async def func_wrapper(*args, **kwargs):
172172
return func_wrapper
173173

174174

175+
DEFAULT_META_HEADERS = frozenset(
176+
[
177+
"url",
178+
"remote_addr",
179+
"user-agent",
180+
]
181+
)
182+
183+
175184
def add_metadata(func: Callable):
176185
"""Decorator to add metadata to resulting http event.
177186
Uses attributes collect_meta and copy_headers_to_logs of endpoint class to decide over metadata collection
178187
Uses attribute metafield_name to define key name for metadata
179188
"""
180189

181190
async def func_wrapper(*args, **kwargs):
182-
req = args[1]
183-
endpoint = args[0]
184-
if endpoint.collect_meta:
185-
metadata = {
186-
"url": req.url,
187-
"remote_addr": req.remote_addr,
188-
"user_agent": req.user_agent,
189-
}
190-
kwargs["metadata"] = {endpoint.metafield_name: metadata}
191-
else:
191+
req: falcon.Request = args[1]
192+
endpoint: HttpEndpoint = args[0]
193+
194+
if not endpoint.collect_meta or len(endpoint.copy_headers_to_logs) == 0:
192195
kwargs["metadata"] = {}
196+
else:
197+
metadata = {}
198+
for header in endpoint.copy_headers_to_logs:
199+
# remote_addr and url are special cases, because those are not copied 1 to 1 from headers
200+
match header:
201+
case "remote_addr":
202+
metadata[header] = req.remote_addr
203+
case "url":
204+
metadata[header] = req.url
205+
case _:
206+
key = header.replace("-", "_").lower()
207+
metadata[key] = req.get_header(header, required=False, default=None)
208+
209+
kwargs["metadata"] = {endpoint.metafield_name: metadata}
210+
193211
func_wrapper = await func(*args, **kwargs)
194212
return func_wrapper
195213

@@ -231,9 +249,13 @@ def __init__(
231249
metafield_name: str,
232250
credentials: Credentials,
233251
metrics: "HttpInput.Metrics",
252+
copy_headers_to_logs: set[str],
234253
) -> None:
235254
self.messages = messages
236255
self.original_event_field = original_event_field
256+
self.copy_headers_to_logs = copy_headers_to_logs
257+
258+
# Deprecated
237259
self.collect_meta = collect_meta
238260
self.metafield_name = metafield_name
239261
self.credentials = credentials
@@ -271,6 +293,12 @@ async def get_data(self, req: falcon.Request) -> bytes:
271293
data = zlib.decompress(data, 31)
272294
return data
273295

296+
def put_message(self, event: dict, metadata: dict):
297+
"""Puts message to internal queue"""
298+
if self.metafield_name in event:
299+
logger.warning("metadata field was in event and got overwritten")
300+
self.messages.put(event | metadata, block=False)
301+
274302

275303
class JSONHttpEndpoint(HttpEndpoint):
276304
""":code:`json` endpoint to get json from request"""
@@ -293,7 +321,7 @@ async def __call__(self, req, resp, **kwargs): # pylint: disable=arguments-diff
293321
)
294322
event = {}
295323
add_fields_to(event, {target_field: event_value})
296-
self.messages.put(event | kwargs["metadata"], block=False)
324+
self.put_message(event, kwargs["metadata"])
297325

298326

299327
class JSONLHttpEndpoint(HttpEndpoint):
@@ -317,7 +345,8 @@ async def __call__(self, req, resp, **kwargs): # pylint: disable=arguments-diff
317345
)
318346
event = {}
319347
add_fields_to(event, {target_field: event_value})
320-
self.messages.put(event | kwargs["metadata"], block=False, batch_size=len(events))
348+
349+
self.put_message(event, kwargs["metadata"])
321350

322351

323352
class PlaintextHttpEndpoint(HttpEndpoint):
@@ -339,7 +368,7 @@ async def __call__(self, req, resp, **kwargs): # pylint: disable=arguments-diff
339368
)
340369
event = {}
341370
add_fields_to(event, {target_field: event_value})
342-
self.messages.put(event | kwargs["metadata"], block=False)
371+
self.put_message(event, kwargs["metadata"])
343372

344373

345374
class HttpInput(Input):
@@ -369,7 +398,7 @@ class Metrics(Input.Metrics):
369398
class Config(Input.Config):
370399
"""Config for HTTPInput"""
371400

372-
uvicorn_config: Mapping[str, Union[str, int]] = field(
401+
uvicorn_config: Mapping[str, str | int] = field(
373402
validator=[
374403
validators.instance_of(dict),
375404
validators.deep_mapping(
@@ -432,8 +461,32 @@ class Config(Input.Config):
432461
be smaller than default value of 15.000 messages.
433462
"""
434463

435-
collect_meta: str = field(validator=validators.instance_of(bool), default=True)
436-
"""Defines if metadata should be collected
464+
copy_headers_to_logs: set[str] = field(
465+
validator=validators.deep_iterable(
466+
member_validator=validators.instance_of(str),
467+
iterable_validator=validators.or_(
468+
validators.instance_of(set), validators.instance_of(list)
469+
),
470+
),
471+
converter=set,
472+
factory=lambda: set(DEFAULT_META_HEADERS),
473+
)
474+
"""Defines what metadata should be collected from Http Headers
475+
Special cases:
476+
- remote_addr (Gets the inbound client ip instead of header)
477+
- url (Get the requested url from http request and not technically a header)
478+
479+
Defaults:
480+
- remote_addr
481+
- url
482+
- User-Agent
483+
484+
The output header names in events are stored as JSON strings and are normalized by replacing "-" with "_" and lowercasing, e.g. "User-Agent" becomes "user_agent"
485+
"""
486+
487+
collect_meta: bool = field(validator=validators.instance_of(bool), default=True)
488+
"""Deprecated: use copy_headers_to_logs instead. To turn off collecting metadata, set copy_headers_to_logs to an empty list ([]).
489+
Defines if metadata should be collected
437490
- :code:`True`: Collect metadata
438491
- :code:`False`: Won't collect metadata
439492
@@ -445,11 +498,15 @@ class Config(Input.Config):
445498
"""
446499

447500
metafield_name: str = field(validator=validators.instance_of(str), default="@metadata")
448-
"""Defines the name of the key for the collected metadata fields"""
501+
"""Defines the name of the key for the collected metadata fields
502+
Logs a Warning if metadata field overwrites preexisting field in Event
503+
"""
449504

450505
original_event_field: dict = field(
451506
validator=[
507+
# type: ignore
452508
validators.optional(
509+
# type: ignore
453510
validators.deep_mapping(
454511
key_validator=validators.in_(["format", "target_field"]),
455512
value_validator=validators.instance_of(str),
@@ -469,11 +526,11 @@ def __attrs_post_init__(self):
469526
"Cannot configure both add_full_event_to_target_field and original_event_field."
470527
)
471528

472-
__slots__: List[str] = ["target", "app", "http_server"]
529+
__slots__: list[str] = ["target", "app", "http_server"]
473530

474531
messages: mp.Queue = None
475532

476-
_endpoint_registry: Mapping[str, Type[HttpEndpoint]] = {
533+
_endpoint_registry: Mapping[str, type[HttpEndpoint]] = {
477534
"json": JSONHttpEndpoint,
478535
"plaintext": PlaintextHttpEndpoint,
479536
"jsonl": JSONLHttpEndpoint,
@@ -506,6 +563,7 @@ def setup(self):
506563

507564
endpoints_config = {}
508565
collect_meta = self._config.collect_meta
566+
copy_headers_to_logs = self._config.copy_headers_to_logs
509567
metafield_name = self._config.metafield_name
510568
original_event_field = self._config.original_event_field
511569
cred_factory = CredentialsFactory()
@@ -521,6 +579,7 @@ def setup(self):
521579
metafield_name,
522580
credentials,
523581
self.metrics,
582+
copy_headers_to_logs,
524583
)
525584

526585
self.app = self._get_asgi_app(endpoints_config)
@@ -537,7 +596,7 @@ def _get_asgi_app(endpoints_config: dict) -> falcon.asgi.App:
537596
app.add_sink(endpoint, prefix=route_compile_helper(endpoint_path))
538597
return app
539598

540-
def _get_event(self, timeout: float) -> Tuple:
599+
def _get_event(self, timeout: float) -> tuple:
541600
"""Returns the first message from the queue"""
542601
self.metrics.message_backlog_size += self.messages.qsize()
543602
try:
@@ -554,7 +613,7 @@ def shut_down(self):
554613
self.http_server.shut_down()
555614

556615
@cached_property
557-
def health_endpoints(self) -> List[str]:
616+
def health_endpoints(self) -> list[str]:
558617
"""Returns a list of endpoints for internal healthcheck
559618
the endpoints are examples to match against the configured regex enabled
560619
endpoints. The endpoints are normalized to match the regex patterns and

logprep/ng/connector/http/input.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,16 @@
8787
"""
8888

8989
import queue
90+
from collections.abc import Mapping
9091
from functools import cached_property
91-
from typing import Mapping, Type
9292

9393
import falcon
9494
import requests
9595
from attrs import define, field, validators
9696
from joblib._multiprocessing_helpers import mp
9797

9898
from logprep.connector.http.input import (
99+
DEFAULT_META_HEADERS,
99100
HttpEndpoint,
100101
JSONHttpEndpoint,
101102
JSONLHttpEndpoint,
@@ -202,8 +203,32 @@ class Config(Input.Config):
202203
be smaller than default value of 15.000 messages.
203204
"""
204205

205-
collect_meta: str = field(validator=validators.instance_of(bool), default=True)
206-
"""Defines if metadata should be collected
206+
copy_headers_to_logs: set[str] = field(
207+
validator=validators.deep_iterable(
208+
member_validator=validators.instance_of(str),
209+
iterable_validator=validators.or_(
210+
validators.instance_of(set), validators.instance_of(list)
211+
),
212+
),
213+
converter=set,
214+
factory=lambda: set(DEFAULT_META_HEADERS),
215+
)
216+
"""Defines what metadata should be collected from Http Headers
217+
Special cases:
218+
- remote_addr (Gets the inbound client ip instead of header)
219+
- url (Get the requested url from http request and not technically a header)
220+
221+
Defaults:
222+
- remote_addr
223+
- url
224+
- User-Agent
225+
226+
The output header names in events are stored as JSON strings and are normalized by replacing "-" with "_" and lowercasing, e.g. "User-Agent" becomes "user_agent"
227+
"""
228+
229+
collect_meta: bool = field(validator=validators.instance_of(bool), default=True)
230+
"""Deprecated: use copy_headers_to_logs instead. To turn off collecting metadata, set copy_headers_to_logs to an empty list ([]).
231+
Defines if metadata should be collected
207232
- :code:`True`: Collect metadata
208233
- :code:`False`: Won't collect metadata
209234
@@ -218,14 +243,12 @@ class Config(Input.Config):
218243
"""Defines the name of the key for the collected metadata fields"""
219244

220245
original_event_field: dict = field(
221-
validator=[
222-
validators.optional(
223-
validators.deep_mapping(
224-
key_validator=validators.in_(["format", "target_field"]),
225-
value_validator=validators.instance_of(str),
226-
)
227-
),
228-
],
246+
validator=validators.optional(
247+
validators.deep_mapping(
248+
key_validator=validators.in_(["format", "target_field"]),
249+
value_validator=validators.instance_of(str),
250+
)
251+
),
229252
default=None,
230253
)
231254
"""Optional config parameter that writes the full event to one single target field. The
@@ -243,7 +266,7 @@ def __attrs_post_init__(self):
243266

244267
messages: mp.Queue = None
245268

246-
_endpoint_registry: Mapping[str, Type[HttpEndpoint]] = {
269+
_endpoint_registry: Mapping[str, type[HttpEndpoint]] = {
247270
"json": JSONHttpEndpoint,
248271
"plaintext": PlaintextHttpEndpoint,
249272
"jsonl": JSONLHttpEndpoint,
@@ -267,6 +290,8 @@ def setup(self) -> None:
267290
super().setup()
268291
endpoints_config = {}
269292
collect_meta = self._config.collect_meta
293+
copy_headers_to_logs = self._config.copy_headers_to_logs
294+
270295
metafield_name = self._config.metafield_name
271296
original_event_field = self._config.original_event_field
272297
cred_factory = CredentialsFactory()
@@ -286,6 +311,7 @@ def setup(self) -> None:
286311
metafield_name,
287312
credentials,
288313
self.metrics,
314+
copy_headers_to_logs,
289315
)
290316

291317
self.app = self._get_asgi_app(endpoints_config)

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ dev = [
110110
"pytest-asyncio",
111111
"pre-commit",
112112
"mypy>=1.18.2",
113-
"types-requests",
114113
"psutil",
114+
"types-requests",
115115
"types-psutil",
116116
"types-PyYaml",
117117
]
@@ -188,6 +188,11 @@ exclude = "tests/.*"
188188
follow_imports = "skip"
189189
follow_imports_for_stubs = "True"
190190

191+
# The dev packages are evidently installed, but mypy does not seem to detect them
192+
[[tool.mypy.overrides]]
193+
module = ["requests.*"]
194+
ignore_missing_imports = true
195+
191196
[[tool.mypy.overrides]]
192197
module = ["logprep.framework.pipeline_manager.*"]
193198
disable_error_code = "attr-defined"

tests/unit/connector/test_file_input_default_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
)
2222
from tests.unit.connector.base import BaseInputTestCase
2323

24-
CHECK_INTERVAL = 0.1
24+
CHECK_INTERVAL = 2
2525

2626

2727
def wait_for_interval(interval):

0 commit comments

Comments
 (0)