@@ -13,6 +13,7 @@ import (
1313 "github.com/blevesearch/bleve/v2/analysis/lang/en"
1414 "github.com/blevesearch/bleve/v2/mapping"
1515 "github.com/blevesearch/bleve/v2/search/query"
16+ "github.com/gabriel-vasile/mimetype"
1617 "github.com/uozi-tech/cosy/logger"
1718)
1819
@@ -40,6 +41,12 @@ type SearchIndexer struct {
4041 ctx context.Context
4142 cancel context.CancelFunc
4243 cleanupOnce sync.Once
44+
45+ // Memory management
46+ totalContentSize int64
47+ documentCount int64
48+ maxMemoryUsage int64
49+ memoryMutex sync.RWMutex
4350}
4451
4552var (
@@ -57,7 +64,8 @@ func GetSearchIndexer() *SearchIndexer {
5764 }
5865
5966 searchIndexer = & SearchIndexer {
60- indexPath : tempDir ,
67+ indexPath : tempDir ,
68+ maxMemoryUsage : 100 * 1024 * 1024 , // 100MB memory limit for indexed content
6169 }
6270 })
6371 return searchIndexer
@@ -131,6 +139,12 @@ func (si *SearchIndexer) cleanup() {
131139 si .index = nil
132140 }
133141
142+ // Reset memory tracking
143+ si .memoryMutex .Lock ()
144+ si .totalContentSize = 0
145+ si .documentCount = 0
146+ si .memoryMutex .Unlock ()
147+
134148 // Remove the temporary directory
135149 if err := os .RemoveAll (si .indexPath ); err != nil {
136150 logger .Error ("Failed to remove search index directory:" , err )
@@ -191,10 +205,10 @@ func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (er
191205 }
192206 }()
193207
194- // File size limit: 10MB to prevent memory overflow
195- const maxFileSize = 10 * 1024 * 1024 // 10MB
208+ // File size limit: 1MB to prevent memory overflow and improve performance
209+ const maxFileSize = 1024 * 1024 // 1MB
196210 if len (content ) > maxFileSize {
197- logger .Warn ("Skipping file due to size limit" , " path" , configPath , "size" , len (content ), "limit" , maxFileSize )
211+ logger .Debugf ("Skipping file due to size limit, path: %s, size: %d, limit: %d" , configPath , len (content ), maxFileSize )
198212 return nil
199213 }
200214
@@ -203,9 +217,9 @@ func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (er
203217 return nil
204218 }
205219
206- // Basic content validation: check if it's text content
207- if ! isTextContent (content ) {
208- logger .Warn ("Skipping non-text file" , "path " , configPath )
220+ // Basic content validation: check if it's a configuration file
221+ if ! isConfigFile (content ) {
222+ logger .Debugf ("Skipping non-config file: %s " , configPath )
209223 return nil
210224 }
211225
@@ -249,22 +263,38 @@ func (si *SearchIndexer) IndexDocument(doc SearchDocument) (err error) {
249263 }
250264 }()
251265
266+ // Additional size check as a safety measure
267+ if len (doc .Content ) > 2 * 1024 * 1024 { // 2MB absolute limit
268+ return fmt .Errorf ("document content too large: %d bytes" , len (doc .Content ))
269+ }
270+
271+ // Check memory usage before indexing
272+ contentSize := int64 (len (doc .Content ))
273+ if ! si .checkMemoryLimitBeforeIndexing (contentSize ) {
274+ logger .Warn ("Skipping document due to memory limit" , "document_id" , doc .ID , "content_size" , contentSize )
275+ return nil
276+ }
277+
252278 si .indexMutex .RLock ()
253279 defer si .indexMutex .RUnlock ()
254280
255281 if si .index == nil {
256282 return fmt .Errorf ("search index not initialized" )
257283 }
258284
259- // Additional size check as a safety measure
260- if len (doc .Content ) > 50 * 1024 * 1024 { // 50MB absolute limit
261- return fmt .Errorf ("document content too large: %d bytes" , len (doc .Content ))
285+ // Index the document
286+ err = si .index .Index (doc .ID , doc )
287+ if err != nil {
288+ return err
262289 }
263290
291+ // Update memory usage tracking
292+ si .updateMemoryUsage (doc .ID , contentSize , true )
293+
264294 // logger.Debugf("Indexing document: ID=%s, Type=%s, Name=%s, Path=%s",
265295 // doc.ID, doc.Type, doc.Name, doc.Path)
266296
267- return si . index . Index ( doc . ID , doc )
297+ return nil
268298}
269299
270300// Search performs a search query
@@ -324,7 +354,7 @@ func (si *SearchIndexer) searchWithType(ctx context.Context, queryStr string, do
324354 }
325355 results := si .convertResults (res .result )
326356
327- // Debug log the search execution
357+ // log the search execution
328358 logger .Debugf ("Search index query '%s' (type: %s, limit: %d) returned %d results" ,
329359 queryStr , docType , limit , len (results ))
330360
@@ -436,6 +466,10 @@ func (si *SearchIndexer) DeleteDocument(docID string) error {
436466 return fmt .Errorf ("search index not initialized" )
437467 }
438468
469+ // Note: We don't track the exact size of deleted documents here
470+ // as it would require storing document sizes separately.
471+ // The memory tracking will reset during periodic cleanups or restarts.
472+
439473 return si .index .Delete (docID )
440474}
441475
@@ -499,9 +533,16 @@ func (si *SearchIndexer) GetIndexStats() (map[string]interface{}, error) {
499533 return nil , err
500534 }
501535
536+ // Get memory usage statistics
537+ totalContentSize , trackedDocCount , maxMemoryUsage := si .getMemoryUsage ()
538+
502539 return map [string ]interface {}{
503- "document_count" : docCount ,
504- "index_path" : si .indexPath ,
540+ "document_count" : docCount ,
541+ "tracked_document_count" : trackedDocCount ,
542+ "total_content_size" : totalContentSize ,
543+ "max_memory_usage" : maxMemoryUsage ,
544+ "memory_usage_percent" : float64 (totalContentSize ) / float64 (maxMemoryUsage ) * 100 ,
545+ "index_path" : si .indexPath ,
505546 }, nil
506547}
507548
@@ -537,49 +578,72 @@ func SearchAll(ctx context.Context, query string, limit int) ([]SearchResult, er
537578 return GetSearchIndexer ().Search (ctx , query , limit )
538579}
539580
540- // isTextContent checks if the content appears to be text-based
541- // This helps prevent indexing binary files that might have been misidentified
542- func isTextContent (content []byte ) bool {
543- if len (content ) == 0 {
544- return true // Empty content is considered text
545- }
546-
547- // Check for common binary file signatures
548- if len (content ) >= 4 {
549- // Check for some common binary file headers
550- switch {
551- case content [0 ] == 0x7F && content [1 ] == 0x45 && content [2 ] == 0x4C && content [3 ] == 0x46 : // ELF
552- return false
553- case content [0 ] == 0x89 && content [1 ] == 0x50 && content [2 ] == 0x4E && content [3 ] == 0x47 : // PNG
554- return false
555- case content [0 ] == 0xFF && content [1 ] == 0xD8 && content [2 ] == 0xFF : // JPEG
556- return false
557- case content [0 ] == 0x50 && content [1 ] == 0x4B && content [2 ] == 0x03 && content [3 ] == 0x04 : // ZIP
558- return false
559- case content [0 ] == 0x50 && content [1 ] == 0x4B && content [2 ] == 0x05 && content [3 ] == 0x06 : // ZIP (empty)
560- return false
561- case content [0 ] == 0x50 && content [1 ] == 0x4B && content [2 ] == 0x07 && content [3 ] == 0x08 : // ZIP (spanned)
562- return false
563- }
581+ // checkMemoryLimitBeforeIndexing checks if adding new content would exceed memory limits
582+ func (si * SearchIndexer ) checkMemoryLimitBeforeIndexing (contentSize int64 ) bool {
583+ si .memoryMutex .RLock ()
584+ defer si .memoryMutex .RUnlock ()
585+
586+ // Check if adding this content would exceed the memory limit
587+ newTotalSize := si .totalContentSize + contentSize
588+ if newTotalSize > si .maxMemoryUsage {
589+ logger .Debugf ("Memory limit would be exceeded: current=%d, new=%d, limit=%d" ,
590+ si .totalContentSize , newTotalSize , si .maxMemoryUsage )
591+ return false
564592 }
565593
566- // Check if the first part of the content contains mostly printable characters
567- // Sample up to 8KB for performance
568- sampleSize := len (content )
569- if sampleSize > 8192 {
570- sampleSize = 8192
594+ // Also check document count limit (max 1000 documents)
595+ if si .documentCount >= 1000 {
596+ logger .Debugf ("Document count limit reached: %d" , si .documentCount )
597+ return false
571598 }
572599
573- nonPrintableCount := 0
574- for i := 0 ; i < sampleSize ; i ++ {
575- b := content [i ]
576- // Allow printable ASCII characters, newlines, tabs, and carriage returns
577- if (b < 32 && b != 9 && b != 10 && b != 13 ) || b > 126 {
578- nonPrintableCount ++
600+ return true
601+ }
602+
603+ // updateMemoryUsage updates the memory usage tracking
604+ func (si * SearchIndexer ) updateMemoryUsage (documentID string , contentSize int64 , isAddition bool ) {
605+ si .memoryMutex .Lock ()
606+ defer si .memoryMutex .Unlock ()
607+
608+ if isAddition {
609+ si .totalContentSize += contentSize
610+ si .documentCount ++
611+ logger .Debugf ("Added document %s: size=%d, total_size=%d, count=%d" ,
612+ documentID , contentSize , si .totalContentSize , si .documentCount )
613+ } else {
614+ si .totalContentSize -= contentSize
615+ si .documentCount --
616+ if si .totalContentSize < 0 {
617+ si .totalContentSize = 0
579618 }
619+ if si .documentCount < 0 {
620+ si .documentCount = 0
621+ }
622+ logger .Debugf ("Removed document %s: size=%d, total_size=%d, count=%d" ,
623+ documentID , contentSize , si .totalContentSize , si .documentCount )
624+ }
625+ }
626+
627+ // getMemoryUsage returns current memory usage statistics
628+ func (si * SearchIndexer ) getMemoryUsage () (int64 , int64 , int64 ) {
629+ si .memoryMutex .RLock ()
630+ defer si .memoryMutex .RUnlock ()
631+ return si .totalContentSize , si .documentCount , si .maxMemoryUsage
632+ }
633+
634+ // isConfigFile checks if the content is a text/plain file (most nginx configs)
635+ func isConfigFile (content []byte ) bool {
636+ if len (content ) == 0 {
637+ return false // Empty files are not useful for configuration
638+ }
639+
640+ // Detect MIME type and only accept text/plain
641+ mtype := mimetype .Detect (content )
642+
643+ if mtype .Is ("text/plain" ) {
644+ return true
580645 }
581646
582- // If more than 30% of the sampled content is non-printable, consider it binary
583- threshold := float64 (sampleSize ) * 0.3
584- return float64 (nonPrintableCount ) <= threshold
647+ logger .Debugf ("Skipping non-text/plain file with MIME type: %s" , mtype .String ())
648+ return false
585649}
0 commit comments