Skip to content

Commit 307d7ab

Browse files
committed
stronger typing
1 parent 523d8a9 commit 307d7ab

File tree

8 files changed

+89
-34
lines changed

8 files changed

+89
-34
lines changed

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ interface HeaderInfo {
1616
interface Frontmatter {
1717
title?: string
1818
description?: string
19-
[key: string]: any
19+
[key: string]: unknown
2020
}
2121

2222
const logger = createLogger('DocsChunker')

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ import { estimateTokenCount } from '@/lib/tokenization/estimators'
66

77
const logger = createLogger('JsonYamlChunker')
88

9+
type JsonPrimitive = string | number | boolean | null
10+
type JsonValue = JsonPrimitive | JsonObject | JsonArray
11+
type JsonObject = { [key: string]: JsonValue }
12+
type JsonArray = JsonValue[]
13+
914
function getTokenCount(text: string): number {
1015
try {
1116
return getAccurateTokenCount(text, 'text-embedding-3-small')
@@ -59,11 +64,11 @@ export class JsonYamlChunker {
5964
*/
6065
async chunk(content: string): Promise<Chunk[]> {
6166
try {
62-
let data: any
67+
let data: JsonValue
6368
try {
64-
data = JSON.parse(content)
69+
data = JSON.parse(content) as JsonValue
6570
} catch {
66-
data = yaml.load(content)
71+
data = yaml.load(content) as JsonValue
6772
}
6873
const chunks = this.chunkStructuredData(data)
6974

@@ -86,15 +91,15 @@ export class JsonYamlChunker {
8691
/**
8792
* Chunk structured data based on its structure
8893
*/
89-
private chunkStructuredData(data: any, path: string[] = []): Chunk[] {
94+
private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] {
9095
const chunks: Chunk[] = []
9196

9297
if (Array.isArray(data)) {
9398
return this.chunkArray(data, path)
9499
}
95100

96101
if (typeof data === 'object' && data !== null) {
97-
return this.chunkObject(data, path)
102+
return this.chunkObject(data as JsonObject, path)
98103
}
99104

100105
const content = JSON.stringify(data, null, 2)
@@ -118,9 +123,9 @@ export class JsonYamlChunker {
118123
/**
119124
* Chunk an array intelligently
120125
*/
121-
private chunkArray(arr: any[], path: string[]): Chunk[] {
126+
private chunkArray(arr: JsonArray, path: string[]): Chunk[] {
122127
const chunks: Chunk[] = []
123-
let currentBatch: any[] = []
128+
let currentBatch: JsonValue[] = []
124129
let currentTokens = 0
125130

126131
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
@@ -194,7 +199,7 @@ export class JsonYamlChunker {
194199
/**
195200
* Chunk an object intelligently
196201
*/
197-
private chunkObject(obj: Record<string, any>, path: string[]): Chunk[] {
202+
private chunkObject(obj: JsonObject, path: string[]): Chunk[] {
198203
const chunks: Chunk[] = []
199204
const entries = Object.entries(obj)
200205

@@ -213,7 +218,7 @@ export class JsonYamlChunker {
213218
return chunks
214219
}
215220

216-
let currentObj: Record<string, any> = {}
221+
let currentObj: JsonObject = {}
217222
let currentTokens = 0
218223
let currentKeys: string[] = []
219224

apps/sim/lib/file-parsers/docx-parser.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,14 @@ import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils'
66

77
const logger = createLogger('DocxParser')
88

9+
interface MammothMessage {
10+
type: 'warning' | 'error'
11+
message: string
12+
}
13+
914
interface MammothResult {
1015
value: string
11-
messages: any[]
16+
messages: MammothMessage[]
1217
}
1318

1419
export class DocxParser implements FileParser {

apps/sim/lib/file-parsers/types.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,22 @@
1+
export interface FileParseMetadata {
2+
characterCount?: number
3+
pageCount?: number
4+
extractionMethod?: string
5+
warning?: string
6+
messages?: unknown[]
7+
html?: string
8+
type?: string
9+
headers?: string[]
10+
totalRows?: number
11+
rowCount?: number
12+
sheetNames?: string[]
13+
source?: string
14+
[key: string]: unknown
15+
}
16+
117
export interface FileParseResult {
218
content: string
3-
metadata?: Record<string, any>
19+
metadata?: FileParseMetadata
420
}
521

622
export interface FileParser {

apps/sim/lib/knowledge/documents/document-processor.ts

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { getBYOKKey } from '@/lib/api-key/byok'
44
import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers'
55
import { env } from '@/lib/core/config/env'
66
import { parseBuffer, parseFile } from '@/lib/file-parsers'
7+
import type { FileParseMetadata } from '@/lib/file-parsers/types'
78
import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
89
import { StorageService } from '@/lib/uploads'
910
import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server'
@@ -35,7 +36,6 @@ type OCRRequestBody = {
3536
document_url: string
3637
}
3738
include_image_base64: boolean
38-
pages?: number[]
3939
}
4040

4141
const MISTRAL_MAX_PAGES = 1000
@@ -138,7 +138,7 @@ export async function processDocument(
138138
const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined
139139

140140
let chunks: Chunk[]
141-
const metadata = 'metadata' in parseResult ? parseResult.metadata : {}
141+
const metadata: FileParseMetadata = parseResult.metadata ?? {}
142142

143143
const isJsonYaml =
144144
metadata.type === 'json' ||
@@ -154,10 +154,11 @@ export async function processDocument(
154154
})
155155
} else if (StructuredDataChunker.isStructuredData(content, mimeType)) {
156156
logger.info('Using structured data chunker for spreadsheet/CSV content')
157+
const rowCount = metadata.totalRows ?? metadata.rowCount
157158
chunks = await StructuredDataChunker.chunkStructuredData(content, {
158159
chunkSize,
159160
headers: metadata.headers,
160-
totalRows: metadata.totalRows || metadata.rowCount,
161+
totalRows: typeof rowCount === 'number' ? rowCount : undefined,
161162
sheetName: metadata.sheetNames?.[0],
162163
})
163164
} else {
@@ -210,7 +211,7 @@ async function parseDocument(
210211
content: string
211212
processingMethod: 'file-parser' | 'mistral-ocr'
212213
cloudUrl?: string
213-
metadata?: any
214+
metadata?: FileParseMetadata
214215
}> {
215216
const isPDF = mimeType === 'application/pdf'
216217
const hasAzureMistralOCR =
@@ -663,7 +664,7 @@ async function processChunk(
663664
}
664665

665666
// Maximum concurrent chunk processing to avoid overwhelming APIs
666-
const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY || 5
667+
const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY
667668

668669
async function processMistralOCRInBatches(
669670
filename: string,
@@ -736,7 +737,7 @@ async function processMistralOCRInBatches(
736737
async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) {
737738
try {
738739
let content: string
739-
let metadata: any = {}
740+
let metadata: FileParseMetadata = {}
740741

741742
if (fileUrl.startsWith('data:')) {
742743
content = await parseDataURI(fileUrl, filename, mimeType)
@@ -782,7 +783,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
782783
async function parseHttpFile(
783784
fileUrl: string,
784785
filename: string
785-
): Promise<{ content: string; metadata?: any }> {
786+
): Promise<{ content: string; metadata?: FileParseMetadata }> {
786787
const buffer = await downloadFileWithTimeout(fileUrl)
787788

788789
const extension = filename.split('.').pop()?.toLowerCase()

apps/sim/lib/knowledge/documents/service.ts

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,7 @@ export async function createDocumentRecords(
693693
for (const docData of documents) {
694694
const documentId = randomUUID()
695695

696-
let processedTags: Record<string, any> = {}
696+
let processedTags: Partial<ProcessedDocumentTags> = {}
697697

698698
if (docData.documentTagsData) {
699699
try {
@@ -1057,7 +1057,7 @@ export async function createSingleDocument(
10571057
const now = new Date()
10581058

10591059
// Process structured tag data if provided
1060-
let processedTags: Record<string, any> = {
1060+
let processedTags: ProcessedDocumentTags = {
10611061
// Text tags (7 slots)
10621062
tag1: documentData.tag1 ?? null,
10631063
tag2: documentData.tag2 ?? null,
@@ -1533,23 +1533,30 @@ export async function updateDocument(
15331533
return value || null
15341534
}
15351535

1536+
// Type-safe access to tag slots in updateData
1537+
type UpdateDataWithTags = typeof updateData & Record<TagSlot, string | undefined>
1538+
const typedUpdateData = updateData as UpdateDataWithTags
1539+
15361540
ALL_TAG_SLOTS.forEach((slot: TagSlot) => {
1537-
const updateValue = (updateData as any)[slot]
1541+
const updateValue = typedUpdateData[slot]
15381542
if (updateValue !== undefined) {
1539-
;(dbUpdateData as any)[slot] = convertTagValue(slot, updateValue)
1543+
;(dbUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[slot] =
1544+
convertTagValue(slot, updateValue)
15401545
}
15411546
})
15421547

15431548
await db.transaction(async (tx) => {
15441549
await tx.update(document).set(dbUpdateData).where(eq(document.id, documentId))
15451550

1546-
const hasTagUpdates = ALL_TAG_SLOTS.some((field) => (updateData as any)[field] !== undefined)
1551+
const hasTagUpdates = ALL_TAG_SLOTS.some((field) => typedUpdateData[field] !== undefined)
15471552

15481553
if (hasTagUpdates) {
1549-
const embeddingUpdateData: Record<string, any> = {}
1554+
const embeddingUpdateData: Partial<ProcessedDocumentTags> = {}
15501555
ALL_TAG_SLOTS.forEach((field) => {
1551-
if ((updateData as any)[field] !== undefined) {
1552-
embeddingUpdateData[field] = convertTagValue(field, (updateData as any)[field])
1556+
if (typedUpdateData[field] !== undefined) {
1557+
;(embeddingUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[
1558+
field
1559+
] = convertTagValue(field, typedUpdateData[field])
15531560
}
15541561
})
15551562

apps/sim/lib/knowledge/documents/utils.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export interface RetryOptions {
1414
initialDelayMs?: number
1515
maxDelayMs?: number
1616
backoffMultiplier?: number
17-
retryCondition?: (error: RetryableError) => boolean
17+
retryCondition?: (error: unknown) => boolean
1818
}
1919

2020
export interface RetryResult<T> {
@@ -30,11 +30,18 @@ function hasStatus(
3030
return typeof error === 'object' && error !== null && 'status' in error
3131
}
3232

33+
function isRetryableErrorType(error: unknown): error is RetryableError {
34+
if (!error) return false
35+
if (error instanceof Error) return true
36+
if (typeof error === 'object' && ('status' in error || 'message' in error)) return true
37+
return false
38+
}
39+
3340
/**
3441
* Default retry condition for rate limiting errors
3542
*/
36-
export function isRetryableError(error: RetryableError): boolean {
37-
if (!error) return false
43+
export function isRetryableError(error: unknown): boolean {
44+
if (!isRetryableErrorType(error)) return false
3845

3946
// Check for rate limiting status codes
4047
if (
@@ -45,7 +52,7 @@ export function isRetryableError(error: RetryableError): boolean {
4552
}
4653

4754
// Check for rate limiting in error messages
48-
const errorMessage = error.message || error.toString()
55+
const errorMessage = error instanceof Error ? error.message : String(error)
4956
const rateLimitKeywords = [
5057
'rate limit',
5158
'rate_limit',

apps/sim/lib/knowledge/embeddings.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,20 @@ interface EmbeddingConfig {
2626
modelName: string
2727
}
2828

29+
interface EmbeddingResponseItem {
30+
embedding: number[]
31+
index: number
32+
}
33+
34+
interface EmbeddingAPIResponse {
35+
data: EmbeddingResponseItem[]
36+
model: string
37+
usage: {
38+
prompt_tokens: number
39+
total_tokens: number
40+
}
41+
}
42+
2943
async function getEmbeddingConfig(
3044
embeddingModel = 'text-embedding-3-small',
3145
workspaceId?: string | null
@@ -104,14 +118,14 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom
104118
)
105119
}
106120

107-
const data = await response.json()
108-
return data.data.map((item: any) => item.embedding)
121+
const data: EmbeddingAPIResponse = await response.json()
122+
return data.data.map((item) => item.embedding)
109123
},
110124
{
111125
maxRetries: 3,
112126
initialDelayMs: 1000,
113127
maxDelayMs: 10000,
114-
retryCondition: (error: any) => {
128+
retryCondition: (error: unknown) => {
115129
if (error instanceof EmbeddingAPIError) {
116130
return error.status === 429 || error.status >= 500
117131
}

0 commit comments

Comments
 (0)