Skip to content

Commit 1007c8e

Browse files
[8.19](backport #50097) [filebeat][ABS] - Fix CSV decoder JSON escaping in azure-blob-storage input (#50108)
* [filebeat][ABS] - Fix CSV decoder JSON escaping in azure-blob-storage input (#50097). Fix CSV decoder type references and broken decodeValue in azure-blob-storage input. The azure-blob-storage input's CSV decode path builds JSON via string concatenation without escaping field values. CSV values containing double quotes (e.g. RFC 2045 MIME type parameters) produce malformed JSON, causing downstream ingest pipeline failures. Add a valueDecoder switch case ahead of the plain decoder case so the CSV decoder's decodeValue() method is used, which serializes via json.Marshal for correct escaping. Also fix decodeValue() itself, which previously cleared its state and then called decode(), causing a 'decode called before next' error. --------- Co-authored-by: Shourie Ganguly <shourie.ganguly@elastic.co>
1 parent 20d9568 commit 1007c8e

File tree

7 files changed

+108
-3
lines changed

7 files changed

+108
-3
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# REQUIRED
2+
# Kind can be one of:
3+
# - breaking-change: a change to previously-documented behavior
4+
# - deprecation: functionality that is being removed in a later release
5+
# - bug-fix: fixes a problem in a previous version
6+
# - enhancement: extends functionality but does not break or fix existing behavior
7+
# - feature: new functionality
8+
# - known-issue: problems that we are aware of in a given version
9+
# - security: impacts on the security of a product or a user's deployment.
10+
# - upgrade: important information for someone upgrading from a prior version
11+
# - other: does not fit into any of the other categories
12+
kind: bug-fix
13+
14+
# REQUIRED for all kinds
15+
# Change summary; an 80ish-character-long description of the change.
16+
summary: Fix CSV decoder producing malformed JSON when field values contain double quotes in azure-blob-storage input.
17+
18+
# REQUIRED for breaking-change, deprecation, known-issue
19+
# Long description; in case the summary is not enough to describe the change
20+
# this field accommodates a description without length limits.
21+
description: |
22+
The azure-blob-storage input's CSV decode path builds JSON via string
23+
concatenation without escaping field values. CSV values containing double
24+
quotes (e.g. RFC 2045 MIME type parameters) produce malformed JSON, causing
25+
downstream ingest pipeline failures. Add a valueDecoder switch case ahead of
26+
the plain decoder case so the CSV decoder's decodeValue() method is used,
27+
which serializes via json.Marshal for correct escaping. Also fix decodeValue()
28+
itself, which previously cleared its state then called decode(), causing a
29+
"decode called before next" error.
30+
31+
# REQUIRED for breaking-change, deprecation, known-issue
32+
# impact:
33+
34+
# REQUIRED for breaking-change, deprecation, known-issue
35+
# action:
36+
37+
# REQUIRED for all kinds
38+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
39+
component: filebeat
40+
41+
# AUTOMATED
42+
# OPTIONAL to manually add other PR URLs
43+
# PR URL: A link to the PR that added the changeset.
44+
# If not present, it is automatically filled by the tooling, which finds the PR where this changelog fragment has been added.
45+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
46+
# Please provide it if you are adding a fragment for a different PR.
47+
# pr: https://github.com/elastic/beats/pull/XXXXX
48+
49+
# AUTOMATED
50+
# OPTIONAL to manually add other issue URLs
51+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
52+
# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
53+
issue: https://github.com/elastic/beats/issues/50097

x-pack/filebeat/input/azureblobstorage/decoding_csv.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package azureblobstorage
77
import (
88
"bytes"
99
"encoding/csv"
10+
"encoding/json"
1011
"fmt"
1112
"io"
1213
"slices"
@@ -106,7 +107,7 @@ func (d *csvDecoder) decodeValue() ([]byte, map[string]any, error) {
106107
m[n] = d.current[i]
107108
}
108109
d.current = d.current[:0]
109-
b, err := d.decode()
110+
b, err := json.Marshal(m)
110111
if err != nil {
111112
return nil, nil, err
112113
}

x-pack/filebeat/input/azureblobstorage/decoding_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,20 @@ func TestDecoding(t *testing.T) {
7070
},
7171
},
7272
},
73+
{
74+
name: "csv_quoted_values",
75+
file: "txn_quoted.csv",
76+
content: "text/csv",
77+
numEvents: 2,
78+
assertAgainst: "txn_quoted.json",
79+
config: decoderConfig{
80+
Codec: &codecConfig{
81+
CSV: &csvCodecConfig{
82+
Enabled: true,
83+
},
84+
},
85+
},
86+
},
7387
}
7488

7589
for _, tc := range testCases {

x-pack/filebeat/input/azureblobstorage/job.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,22 @@ func (j *job) decode(ctx context.Context, r io.Reader, id string) error {
194194
}
195195
var evtOffset int64
196196
switch dec := dec.(type) {
197+
case valueDecoder:
198+
defer dec.close()
199+
200+
for dec.next() {
201+
msg, _, err := dec.decodeValue()
202+
if err != nil {
203+
if errors.Is(err, io.EOF) {
204+
return nil
205+
}
206+
j.status.UpdateStatus(status.Degraded, err.Error())
207+
return err
208+
}
209+
evt := j.createEvent(string(msg), evtOffset)
210+
j.publish(evt, !dec.more(), id)
211+
}
212+
197213
case decoder:
198214
defer dec.close()
199215

x-pack/filebeat/input/azureblobstorage/mock/data_files.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,6 @@ var BeatsFilesContainer_multiline_json_gz = []string{
262262
}
263263

264264
var BeatsFilesContainer_csv = []string{
265-
"{\"id\":\"1\",\"name\":\"Alice\",\"email\":\"alice@example.com\",\"status\":\"active\"}",
266-
"{\"id\":\"2\",\"name\":\"Bob\",\"email\":\"bob@example.com\",\"status\":\"inactive\"}",
265+
"{\"email\":\"alice@example.com\",\"id\":\"1\",\"name\":\"Alice\",\"status\":\"active\"}",
266+
"{\"email\":\"bob@example.com\",\"id\":\"2\",\"name\":\"Bob\",\"status\":\"inactive\"}",
267267
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
date,time,content-type,response-type,host,category
2+
2020-01-01,00:00:00,"text/plain; charset=""us-ascii""","multipart/mixed; boundary=""gc0p4Jq0M2Yt08jU534c0p""",test.invalid,demo
3+
2020-01-02,12:30:00,application/json,application/json,example.invalid,demo
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[
2+
{
3+
"date": "2020-01-01",
4+
"time": "00:00:00",
5+
"content-type": "text/plain; charset=\"us-ascii\"",
6+
"response-type": "multipart/mixed; boundary=\"gc0p4Jq0M2Yt08jU534c0p\"",
7+
"host": "test.invalid",
8+
"category": "demo"
9+
},
10+
{
11+
"date": "2020-01-02",
12+
"time": "12:30:00",
13+
"content-type": "application/json",
14+
"response-type": "application/json",
15+
"host": "example.invalid",
16+
"category": "demo"
17+
}
18+
]

0 commit comments

Comments
 (0)