Skip to content

Commit afa989c

Browse files
[filebeat][ABS] - Fix CSV decoder JSON escaping in azure-blob-storage input (#50097) (#50110)
x-pack/filebeat/input/azureblobstorage: Fix CSV decoder JSON escaping in azure-blob-storage input. The azure-blob-storage input's decode path only matched the decoder.Decoder interface, calling Decode(), which builds JSON via string concatenation without escaping field values. CSV values containing double quotes (e.g. RFC 2045 MIME type parameters) produce malformed JSON, causing downstream ingest pipeline failures. Added a decoder.ValueDecoder switch case ahead of decoder.Decoder, matching the pattern already used by the GCS input. ValueDecoder's DecodeValue() uses json.Marshal, which handles escaping correctly. (cherry picked from commit 34634a3) Co-authored-by: Shourie Ganguly <shourie.ganguly@elastic.co>
1 parent 1ba454b commit afa989c

File tree

6 files changed

+104
-2
lines changed

6 files changed

+104
-2
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# REQUIRED
2+
# Kind can be one of:
3+
# - breaking-change: a change to previously-documented behavior
4+
# - deprecation: functionality that is being removed in a later release
5+
# - bug-fix: fixes a problem in a previous version
6+
# - enhancement: extends functionality but does not break or fix existing behavior
7+
# - feature: new functionality
8+
# - known-issue: problems that we are aware of in a given version
9+
# - security: impacts on the security of a product or a user's deployment.
10+
# - upgrade: important information for someone upgrading from a prior version
11+
# - other: does not fit into any of the other categories
12+
kind: bug-fix
13+
14+
# REQUIRED for all kinds
15+
# Change summary; an 80ish-character description of the change.
16+
summary: Fix CSV decoder producing malformed JSON when field values contain double quotes in azure-blob-storage input.
17+
18+
# REQUIRED for breaking-change, deprecation, known-issue
19+
# Long description; in case the summary is not enough to describe the change
20+
# this field accommodates a description without length limits.
21+
description: |
22+
The azure-blob-storage input's decode path only matched the decoder.Decoder
23+
interface, which builds JSON via string concatenation without escaping field
24+
values. CSV values containing double quotes (e.g. RFC 2045 MIME type
25+
parameters) produce malformed JSON, causing downstream ingest pipeline
26+
failures. Add a decoder.ValueDecoder switch case which uses json.Marshal
27+
for correct escaping, matching the pattern already used by the GCS input.
28+
29+
# REQUIRED for breaking-change, deprecation, known-issue
30+
# impact:
31+
32+
# REQUIRED for breaking-change, deprecation, known-issue
33+
# action:
34+
35+
# REQUIRED for all kinds
36+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
37+
component: filebeat
38+
39+
# AUTOMATED
40+
# OPTIONAL to manually add other PR URLs
41+
# PR URL: A link to the PR that added the changeset.
42+
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
43+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
44+
# Please provide it if you are adding a fragment for a different PR.
45+
# pr: https://github.com/elastic/beats/pull/XXXXX
46+
47+
# AUTOMATED
48+
# OPTIONAL to manually add other issue URLs
49+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
50+
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
51+
issue: https://github.com/elastic/beats/issues/50097

x-pack/filebeat/input/azureblobstorage/decoding_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,20 @@ func TestDecoding(t *testing.T) {
7777
},
7878
},
7979
},
80+
{
81+
name: "csv_quoted_values",
82+
file: "txn_quoted.csv",
83+
content: "text/csv",
84+
numEvents: 2,
85+
assertAgainst: "txn_quoted.json",
86+
config: decoder.Config{
87+
Codec: &decoder.CodecConfig{
88+
CSV: &decoder.CSVCodecConfig{
89+
Enabled: true,
90+
},
91+
},
92+
},
93+
},
8094
}
8195

8296
for _, tc := range testCases {

x-pack/filebeat/input/azureblobstorage/job.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,22 @@ func (j *job) decode(ctx context.Context, r io.Reader, id string) error {
195195
}
196196
var evtOffset int64
197197
switch dec := dec.(type) {
198+
case decoder.ValueDecoder:
199+
defer dec.Close()
200+
201+
for dec.Next() {
202+
offset, msg, _, err := dec.DecodeValue()
203+
if err != nil {
204+
if errors.Is(err, io.EOF) {
205+
return nil
206+
}
207+
j.status.UpdateStatus(status.Degraded, err.Error())
208+
return err
209+
}
210+
evt := j.createEvent(string(msg), offset)
211+
j.publish(evt, !dec.More(), id)
212+
}
213+
198214
case decoder.Decoder:
199215
defer dec.Close()
200216

x-pack/filebeat/input/azureblobstorage/mock/data_files.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,6 @@ var BeatsFilesContainer_multiline_json_gz = []string{
262262
}
263263

264264
var BeatsFilesContainer_csv = []string{
265-
"{\"id\":\"1\",\"name\":\"Alice\",\"email\":\"alice@example.com\",\"status\":\"active\"}",
266-
"{\"id\":\"2\",\"name\":\"Bob\",\"email\":\"bob@example.com\",\"status\":\"inactive\"}",
265+
"{\"email\":\"alice@example.com\",\"id\":\"1\",\"name\":\"Alice\",\"status\":\"active\"}",
266+
"{\"email\":\"bob@example.com\",\"id\":\"2\",\"name\":\"Bob\",\"status\":\"inactive\"}",
267267
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
date,time,content-type,response-type,host,category
2+
2020-01-01,00:00:00,"text/plain; charset=""us-ascii""","multipart/mixed; boundary=""gc0p4Jq0M2Yt08jU534c0p""",test.invalid,demo
3+
2020-01-02,12:30:00,application/json,application/json,example.invalid,demo
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[
2+
{
3+
"date": "2020-01-01",
4+
"time": "00:00:00",
5+
"content-type": "text/plain; charset=\"us-ascii\"",
6+
"response-type": "multipart/mixed; boundary=\"gc0p4Jq0M2Yt08jU534c0p\"",
7+
"host": "test.invalid",
8+
"category": "demo"
9+
},
10+
{
11+
"date": "2020-01-02",
12+
"time": "12:30:00",
13+
"content-type": "application/json",
14+
"response-type": "application/json",
15+
"host": "example.invalid",
16+
"category": "demo"
17+
}
18+
]

0 commit comments

Comments
 (0)