
Commit 24eb437

JJK801 and HenryHengZJ authored
[Feature] improve CsvLoader & clean code (#3830)
* Improve CSV Loader
* Improve S3 Loaders

Co-authored-by: Henry <[email protected]>
1 parent cc87d85 · commit 24eb437

7 files changed: +229 additions, -206 deletions


packages/components/nodes/documentloaders/Csv/Csv.ts

Lines changed: 42 additions & 83 deletions
@@ -1,7 +1,6 @@
-import { omit } from 'lodash'
 import { TextSplitter } from 'langchain/text_splitter'
-import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
-import { getFileFromStorage, handleEscapeCharacters } from '../../../src'
+import { CSVLoader } from './CsvLoader'
+import { getFileFromStorage, handleDocumentLoaderDocuments, handleDocumentLoaderMetadata, handleDocumentLoaderOutput } from '../../../src'
 import { ICommonObject, IDocument, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
 
 class Csv_DocumentLoaders implements INode {
@@ -19,7 +18,7 @@ class Csv_DocumentLoaders implements INode {
     constructor() {
         this.label = 'Csv File'
         this.name = 'csvFile'
-        this.version = 2.0
+        this.version = 3.0
         this.type = 'Document'
         this.icon = 'csv.svg'
         this.category = 'Document Loaders'
@@ -82,21 +81,11 @@
             ]
         }
     }
 
-    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
-        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+    getFiles(nodeData: INodeData) {
         const csvFileBase64 = nodeData.inputs?.csvFile as string
-        const columnName = nodeData.inputs?.columnName as string
-        const metadata = nodeData.inputs?.metadata
-        const output = nodeData.outputs?.output as string
-        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
-
-        let omitMetadataKeys: string[] = []
-        if (_omitMetadataKeys) {
-            omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
-        }
 
-        let docs: IDocument[] = []
         let files: string[] = []
+        let fromStorage: boolean = true
 
         if (csvFileBase64.startsWith('FILE-STORAGE::')) {
             const fileName = csvFileBase64.replace('FILE-STORAGE::', '')
@@ -105,86 +94,56 @@
             } else {
                 files = [fileName]
             }
-            const chatflowid = options.chatflowid
-
-            for (const file of files) {
-                if (!file) continue
-                const fileData = await getFileFromStorage(file, chatflowid)
-                const blob = new Blob([fileData])
-                const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim())
-
-                if (textSplitter) {
-                    docs = await loader.load()
-                    docs = await textSplitter.splitDocuments(docs)
-                } else {
-                    docs.push(...(await loader.load()))
-                }
-            }
         } else {
             if (csvFileBase64.startsWith('[') && csvFileBase64.endsWith(']')) {
                 files = JSON.parse(csvFileBase64)
             } else {
                 files = [csvFileBase64]
             }
 
-            for (const file of files) {
-                if (!file) continue
-                const splitDataURI = file.split(',')
-                splitDataURI.pop()
-                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
-                const blob = new Blob([bf])
-                const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim())
-
-                if (textSplitter) {
-                    docs = await loader.load()
-                    docs = await textSplitter.splitDocuments(docs)
-                } else {
-                    docs.push(...(await loader.load()))
-                }
-            }
+            fromStorage = false
         }
 
-        if (metadata) {
-            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
-            docs = docs.map((doc) => ({
-                ...doc,
-                metadata:
-                    _omitMetadataKeys === '*'
-                        ? {
-                              ...parsedMetadata
-                          }
-                        : omit(
-                              {
-                                  ...doc.metadata,
-                                  ...parsedMetadata
-                              },
-                              omitMetadataKeys
-                          )
-            }))
+        return { files, fromStorage }
+    }
+
+    async getFileData(file: string, { chatflowid }: { chatflowid: string }, fromStorage?: boolean) {
+        if (fromStorage) {
+            return getFileFromStorage(file, chatflowid)
         } else {
-            docs = docs.map((doc) => ({
-                ...doc,
-                metadata:
-                    _omitMetadataKeys === '*'
-                        ? {}
-                        : omit(
-                              {
-                                  ...doc.metadata
-                              },
-                              omitMetadataKeys
-                          )
-            }))
+            const splitDataURI = file.split(',')
+            splitDataURI.pop()
+            return Buffer.from(splitDataURI.pop() || '', 'base64')
         }
+    }
 
-        if (output === 'document') {
-            return docs
-        } else {
-            let finaltext = ''
-            for (const doc of docs) {
-                finaltext += `${doc.pageContent}\n`
-            }
-            return handleEscapeCharacters(finaltext, false)
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const columnName = nodeData.inputs?.columnName as string
+        const metadata = nodeData.inputs?.metadata
+        const output = nodeData.outputs?.output as string
+        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
+
+        let docs: IDocument[] = []
+
+        const chatflowid = options.chatflowid
+
+        const { files, fromStorage } = this.getFiles(nodeData)
+
+        for (const file of files) {
+            if (!file) continue
+
+            const fileData = await this.getFileData(file, { chatflowid }, fromStorage)
+            const blob = new Blob([fileData])
+            const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim())
+
+            // use spread instead of push, because it raises RangeError: Maximum call stack size exceeded when too many docs
+            docs = [...docs, ...(await handleDocumentLoaderDocuments(loader, textSplitter))]
        }
+
+        docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)
+
+        return handleDocumentLoaderOutput(docs, output)
     }
 }
 
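The refactor above leans on three shared helpers (handleDocumentLoaderDocuments, handleDocumentLoaderMetadata, handleDocumentLoaderOutput) that are imported from ../../../src but not defined in this commit. What follows is a minimal sketch of what they are assumed to do, inferred from the per-node logic this diff removes; the signatures come from the call sites above, while the bodies are illustrative only and may differ from the real helpers.

// Hedged sketch: assumed behavior of the shared document-loader helpers, not the actual implementation.
import { omit } from 'lodash'
import { Document } from '@langchain/core/documents'
import { TextSplitter } from 'langchain/text_splitter'

// Load, then optionally split -- mirrors the "if (textSplitter)" branch removed from each node.
async function handleDocumentLoaderDocuments(loader: { load(): Promise<Document[]> }, textSplitter?: TextSplitter) {
    const docs = await loader.load()
    return textSplitter ? textSplitter.splitDocuments(docs) : docs
}

// Merge user-supplied metadata and drop omitted keys ('*' keeps only the user metadata),
// matching the two docs.map(...) branches that previously lived in Csv.ts and S3Directory.ts.
function handleDocumentLoaderMetadata(docs: Document[], _omitMetadataKeys?: string, metadata?: any) {
    const omitMetadataKeys = _omitMetadataKeys ? _omitMetadataKeys.split(',').map((key) => key.trim()) : []
    const parsedMetadata = metadata ? (typeof metadata === 'object' ? metadata : JSON.parse(metadata)) : {}
    return docs.map((doc) => ({
        ...doc,
        metadata:
            _omitMetadataKeys === '*'
                ? { ...parsedMetadata }
                : omit({ ...doc.metadata, ...parsedMetadata }, omitMetadataKeys)
    }))
}

// Return the documents directly, or concatenate pageContent when the node output is text.
// The real helper presumably also applies handleEscapeCharacters, as the removed code did.
function handleDocumentLoaderOutput(docs: Document[], output: string) {
    if (output === 'document') return docs
    return docs.map((doc) => `${doc.pageContent}\n`).join('')
}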
packages/components/nodes/documentloaders/Csv/CsvLoader.ts

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import { TextLoader } from 'langchain/document_loaders/fs/text'
+import Papa from 'papaparse'
+
+type CSVLoaderOptions = {
+    // Return specific column from key (string) or index (integer)
+    column?: string | number
+    // Force separator (default: auto detect)
+    separator?: string
+}
+
+/**
+ * A class that extends the TextLoader class. It represents a document
+ * loader that loads documents from a CSV file. It has a constructor that
+ * takes a `filePathOrBlob` parameter representing the path to the CSV
+ * file or a Blob object, and an optional `options` parameter of type
+ * `CSVLoaderOptions` or a string representing the column to use as the
+ * document's pageContent.
+ */
+export class CSVLoader extends TextLoader {
+    protected options: CSVLoaderOptions = {}
+
+    constructor(filePathOrBlob: ConstructorParameters<typeof TextLoader>[0], options?: CSVLoaderOptions | string) {
+        super(filePathOrBlob)
+
+        if (typeof options === 'string') {
+            this.options = { column: options }
+        } else {
+            this.options = options ?? this.options
+        }
+    }
+    /**
+     * A protected method that parses the raw CSV data and returns an array of
+     * strings representing the pageContent of each document. It uses
+     * `papaparse` to parse the CSV data. If
+     * the `column` option is specified, it checks if the column exists in the
+     * CSV file and returns the values of that column as the pageContent. If
+     * the `column` option is not specified, it converts each row of the CSV
+     * data into key/value pairs and joins them with newline characters.
+     * @param raw The raw CSV data to be parsed.
+     * @returns An array of strings representing the pageContent of each document.
+     */
+    async parse(raw: string): Promise<string[]> {
+        const { column, separator } = this.options
+
+        const {
+            data: parsed,
+            meta: { fields = [] }
+        } = Papa.parse<{ [K: string]: string }>(raw.trim(), {
+            delimiter: separator,
+            header: true
+        })
+
+        if (column !== undefined) {
+            if (!fields.length) {
+                throw new Error(`Unable to resolve fields from header.`)
+            }
+
+            let searchIdx = column
+
+            if (typeof column == 'number') {
+                searchIdx = fields[column]
+            }
+
+            if (!fields.includes(searchIdx as string)) {
+                throw new Error(`Column ${column} not found in CSV file.`)
+            }
+
+            // Note TextLoader will raise an exception if the value is null.
+            return parsed.map((row) => row[searchIdx])
+        }
+
+        return parsed.map((row) => fields.map((key) => `${key.trim() || '_0'}: ${row[key]?.trim()}`).join('\n'))
+    }
+}
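The new CSVLoader keeps the old single-string signature (a bare column name) while adding an options object for selecting a column by header name or index and for forcing a separator. A minimal usage sketch, assuming a Node 18+ global Blob; the demo function and sample data are illustrative:

import { CSVLoader } from './CsvLoader'

async function demo() {
    const blob = new Blob(['name;age\nAda;36\nGrace;45'])

    // Default: each row becomes one document of "key: value" lines; papaparse auto-detects the delimiter.
    const rowDocs = await new CSVLoader(blob).load()

    // Select a single column by header name and force ';' as the separator.
    const nameDocs = await new CSVLoader(blob, { column: 'name', separator: ';' }).load()

    // A column index works too, and a bare string keeps the previous CSVLoader signature working.
    const byIndex = await new CSVLoader(blob, { column: 1 }).load()
    const legacy = await new CSVLoader(blob, 'age').load()

    return { rowDocs, nameDocs, byIndex, legacy }
}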

packages/components/nodes/documentloaders/S3Directory/S3Directory.ts

Lines changed: 20 additions & 66 deletions
@@ -1,6 +1,11 @@
-import { omit } from 'lodash'
 import { ICommonObject, INode, INodeData, INodeOptionsValue, INodeOutputsValue, INodeParams } from '../../../src/Interface'
-import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
+import {
+    getCredentialData,
+    getCredentialParam,
+    handleDocumentLoaderDocuments,
+    handleDocumentLoaderMetadata,
+    handleDocumentLoaderOutput
+} from '../../../src/utils'
 import { S3Client, GetObjectCommand, S3ClientConfig, ListObjectsV2Command, ListObjectsV2Output } from '@aws-sdk/client-s3'
 import { getRegions, MODEL_TYPE } from '../../../src/modelLoader'
 import { Readable } from 'node:stream'
@@ -10,12 +15,13 @@ import * as os from 'node:os'
 
 import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'
 import { JSONLoader } from 'langchain/document_loaders/fs/json'
-import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'
 import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'
 import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'
 import { TextLoader } from 'langchain/document_loaders/fs/text'
 import { TextSplitter } from 'langchain/text_splitter'
 
+import { CSVLoader } from '../Csv/CsvLoader'
+
 class S3_DocumentLoaders implements INode {
     label: string
     name: string
@@ -151,11 +157,6 @@ class S3_DocumentLoaders implements INode {
         const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
         const output = nodeData.outputs?.output as string
 
-        let omitMetadataKeys: string[] = []
-        if (_omitMetadataKeys) {
-            omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
-        }
-
        let credentials: S3ClientConfig['credentials'] | undefined
 
        if (nodeData.credential) {
@@ -241,11 +242,11 @@
                 '.csv': (path) => new CSVLoader(path),
                 '.docx': (path) => new DocxLoader(path),
                 '.pdf': (path) =>
-                    pdfUsage === 'perFile'
-                        ? // @ts-ignore
-                          new PDFLoader(path, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') })
-                        : // @ts-ignore
-                          new PDFLoader(path, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }),
+                    new PDFLoader(path, {
+                        splitPages: pdfUsage !== 'perFile',
+                        // @ts-ignore
+                        pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
+                    }),
                 '.aspx': (path) => new TextLoader(path),
                 '.asp': (path) => new TextLoader(path),
                 '.cpp': (path) => new TextLoader(path), // C++
@@ -284,63 +285,16 @@
                 true
             )
 
-            let docs = []
-
-            if (textSplitter) {
-                let splittedDocs = await loader.load()
-                splittedDocs = await textSplitter.splitDocuments(splittedDocs)
-                docs.push(...splittedDocs)
-            } else {
-                docs = await loader.load()
-            }
-
-            if (metadata) {
-                const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
-                docs = docs.map((doc) => ({
-                    ...doc,
-                    metadata:
-                        _omitMetadataKeys === '*'
-                            ? {
-                                  ...parsedMetadata
-                              }
-                            : omit(
-                                  {
-                                      ...doc.metadata,
-                                      ...parsedMetadata
-                                  },
-                                  omitMetadataKeys
-                              )
-                }))
-            } else {
-                docs = docs.map((doc) => ({
-                    ...doc,
-                    metadata:
-                        _omitMetadataKeys === '*'
-                            ? {}
-                            : omit(
-                                  {
-                                      ...doc.metadata
-                                  },
-                                  omitMetadataKeys
-                              )
-                }))
-            }
+            let docs = await handleDocumentLoaderDocuments(loader, textSplitter)
 
-            // remove the temp directory before returning docs
-            fsDefault.rmSync(tempDir, { recursive: true })
+            docs = handleDocumentLoaderMetadata(docs, _omitMetadataKeys, metadata)
 
-            if (output === 'document') {
-                return docs
-            } else {
-                let finaltext = ''
-                for (const doc of docs) {
-                    finaltext += `${doc.pageContent}\n`
-                }
-                return handleEscapeCharacters(finaltext, false)
-            }
+            return handleDocumentLoaderOutput(docs, output)
         } catch (e: any) {
-            fsDefault.rmSync(tempDir, { recursive: true })
             throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`)
+        } finally {
+            // remove the temp directory before returning docs
+            fsDefault.rmSync(tempDir, { recursive: true })
        }
     }
 }
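Besides swapping in the shared helpers, the notable change in this file is that the temp-directory removal moves from duplicated calls on the success and catch paths into a single finally block, so the directory is cleaned up exactly once whether loading succeeds or throws. A minimal standalone sketch of that pattern; the withTempDir wrapper and the directory prefix are illustrative and not part of the commit:

import * as fsDefault from 'node:fs'
import * as path from 'node:path'
import * as os from 'node:os'

async function withTempDir<T>(work: (tempDir: string) => Promise<T>): Promise<T> {
    // Create a unique scratch directory, e.g. /tmp/s3fileloader-abc123
    const tempDir = fsDefault.mkdtempSync(path.join(os.tmpdir(), 's3fileloader-'))
    try {
        return await work(tempDir)
    } catch (e: any) {
        throw new Error(`Failed to load data: ${e.message}`)
    } finally {
        // Runs whether work() resolved or threw, so the directory is never leaked
        // and rmSync is no longer duplicated across the success and error paths.
        fsDefault.rmSync(tempDir, { recursive: true })
    }
}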
