Skip to content

Commit b8e017e

Browse files
committed
enhance: content validation in config indexing #1240
1 parent 65a4aaa commit b8e017e

File tree

1 file changed

+88
-2
lines changed

1 file changed

+88
-2
lines changed

internal/cache/search.go

Lines changed: 88 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -182,7 +182,33 @@ func (si *SearchIndexer) createIndexMapping() mapping.IndexMapping {
182182
}
183183

184184
// handleConfigScan processes scanned config files and indexes them
185-
func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) error {
185+
func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (err error) {
186+
// Add panic recovery to prevent the entire application from crashing
187+
defer func() {
188+
if r := recover(); r != nil {
189+
err = fmt.Errorf("panic during config scan: %v", r)
190+
logger.Error("Panic occurred while scanning config", "config_path", configPath, "content_size", len(content), "error", err)
191+
}
192+
}()
193+
194+
// File size limit: 10MB to prevent memory overflow
195+
const maxFileSize = 10 * 1024 * 1024 // 10MB
196+
if len(content) > maxFileSize {
197+
logger.Warn("Skipping file due to size limit", "path", configPath, "size", len(content), "limit", maxFileSize)
198+
return nil
199+
}
200+
201+
// Skip empty files
202+
if len(content) == 0 {
203+
return nil
204+
}
205+
206+
// Basic content validation: check if it's text content
207+
if !isTextContent(content) {
208+
logger.Warn("Skipping non-text file", "path", configPath)
209+
return nil
210+
}
211+
186212
docType := si.determineConfigType(configPath)
187213
if docType == "" {
188214
return nil // Skip unsupported file types
@@ -214,14 +240,27 @@ func (si *SearchIndexer) determineConfigType(configPath string) string {
214240
}
215241

216242
// IndexDocument indexes a single document
217-
func (si *SearchIndexer) IndexDocument(doc SearchDocument) error {
243+
func (si *SearchIndexer) IndexDocument(doc SearchDocument) (err error) {
244+
// Add panic recovery to prevent the entire application from crashing
245+
defer func() {
246+
if r := recover(); r != nil {
247+
err = fmt.Errorf("panic during indexing: %v", r)
248+
logger.Error("Panic occurred while indexing document", "document_id", doc.ID, "error", err)
249+
}
250+
}()
251+
218252
si.indexMutex.RLock()
219253
defer si.indexMutex.RUnlock()
220254

221255
if si.index == nil {
222256
return fmt.Errorf("search index not initialized")
223257
}
224258

259+
// Additional size check as a safety measure
260+
if len(doc.Content) > 50*1024*1024 { // 50MB absolute limit
261+
return fmt.Errorf("document content too large: %d bytes", len(doc.Content))
262+
}
263+
225264
// logger.Debugf("Indexing document: ID=%s, Type=%s, Name=%s, Path=%s",
226265
// doc.ID, doc.Type, doc.Name, doc.Path)
227266

@@ -497,3 +536,50 @@ func SearchConfigs(ctx context.Context, query string, limit int) ([]SearchResult
497536
// SearchAll runs query against the indexer returned by GetSearchIndexer.
// ctx, query, and limit are passed to the indexer's Search method unchanged,
// and that call's results and error are returned as-is.
func SearchAll(ctx context.Context, query string, limit int) ([]SearchResult, error) {
	indexer := GetSearchIndexer()
	return indexer.Search(ctx, query, limit)
}
539+
540+
// isTextContent reports whether content looks like human-readable text rather
// than binary data. It guards against indexing binary files that were
// misidentified as configuration files.
//
// The check has two stages:
//  1. Content that starts with a well-known binary magic number (ELF, PNG,
//     JPEG, ZIP) is rejected outright.
//  2. Up to the first 8 KiB is decoded as UTF-8, and the content is accepted
//     when at most 30% of the decoded characters are suspicious: control
//     characters other than tab/LF/CR, DEL, or bytes that are not valid UTF-8.
//
// Valid multi-byte UTF-8 (for example non-English comments) counts as text.
// Empty content is considered text.
func isTextContent(content []byte) bool {
	if len(content) == 0 {
		return true // Empty content is considered text
	}

	if hasBinarySignature(content) {
		return false
	}

	// Sample only the head of the file for performance. Cutting at a fixed
	// byte offset may split one multi-byte character; the resulting stray
	// bytes (at most 3) are counted as invalid, which is negligible against
	// the 30% threshold for a sample of this size.
	const maxSampleSize = 8192
	sample := content
	if len(sample) > maxSampleSize {
		sample = sample[:maxSampleSize]
	}

	// Ranging over a string decodes UTF-8 and yields U+FFFD for every byte
	// that is not part of a valid sequence, so no extra package is needed to
	// tell real multi-byte text apart from arbitrary high bytes.
	total, suspicious := 0, 0
	for _, r := range string(sample) {
		total++
		switch {
		case r == '\t', r == '\n', r == '\r':
			// Ordinary whitespace found in text files.
		case r < 0x20, r == 0x7F, r == '\uFFFD':
			suspicious++
		}
	}

	// If more than 30% of the sampled characters are suspicious, consider the
	// content binary.
	return float64(suspicious) <= float64(total)*0.3
}

// hasBinarySignature reports whether content begins with the magic number of
// a common binary format: ELF, PNG, JPEG, or ZIP (regular, empty, or spanned).
// Content shorter than four bytes is never matched.
func hasBinarySignature(content []byte) bool {
	if len(content) < 4 {
		return false
	}
	switch string(content[:4]) {
	case "\x7fELF", "\x89PNG", "PK\x03\x04", "PK\x05\x06", "PK\x07\x08":
		return true
	}
	// JPEG is identified by its first three bytes only.
	return string(content[:3]) == "\xff\xd8\xff"
}

0 commit comments

Comments
 (0)