Skip to content

Commit f67bff7

Browse files
committed
enhance: memory management in search indexing #1240
1 parent b8e017e commit f67bff7

File tree

2 files changed

+182
-61
lines changed

2 files changed

+182
-61
lines changed

internal/cache/index.go

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,31 @@ func InitScanner(ctx context.Context) {
4747
}
4848
}
4949

50+
// shouldSkipPath checks if a path should be skipped during scanning or watching
51+
func shouldSkipPath(path string) bool {
52+
// Define directories to exclude from scanning/watching
53+
excludedDirs := []string{
54+
nginx.GetConfPath("ssl"), // SSL certificates and keys
55+
nginx.GetConfPath("cache"), // Nginx cache files
56+
nginx.GetConfPath("logs"), // Log files directory
57+
nginx.GetConfPath("temp"), // Temporary files directory
58+
nginx.GetConfPath("proxy_temp"), // Proxy temporary files
59+
nginx.GetConfPath("client_body_temp"), // Client body temporary files
60+
nginx.GetConfPath("fastcgi_temp"), // FastCGI temporary files
61+
nginx.GetConfPath("uwsgi_temp"), // uWSGI temporary files
62+
nginx.GetConfPath("scgi_temp"), // SCGI temporary files
63+
}
64+
65+
// Check if path starts with any excluded directory
66+
for _, excludedDir := range excludedDirs {
67+
if excludedDir != "" && strings.HasPrefix(path, excludedDir) {
68+
return true
69+
}
70+
}
71+
72+
return false
73+
}
74+
5075
// GetScanner returns the singleton scanner instance
5176
func GetScanner() *Scanner {
5277
scannerInitMutex.Lock()
@@ -95,16 +120,15 @@ func (s *Scanner) Initialize(ctx context.Context) error {
95120
// watchAllDirectories recursively adds all directories under nginx config path to watcher
96121
func (s *Scanner) watchAllDirectories() error {
97122
root := nginx.GetConfPath()
98-
sslDir := nginx.GetConfPath("ssl")
99123

100124
return filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
101125
if err != nil {
102126
return err
103127
}
104128

105129
if d.IsDir() {
106-
// Skip ssl directory
107-
if path == sslDir {
130+
// Skip excluded directories (ssl, cache, logs, temp, etc.)
131+
if shouldSkipPath(path) {
108132
return filepath.SkipDir
109133
}
110134

@@ -170,9 +194,8 @@ func (s *Scanner) handleFileEvent(event fsnotify.Event) {
170194
return
171195
}
172196

173-
// Skip ssl directory
174-
sslDir := nginx.GetConfPath("ssl")
175-
if strings.HasPrefix(event.Name, sslDir) {
197+
// Skip excluded directories (ssl, cache, etc.)
198+
if shouldSkipPath(event.Name) {
176199
return
177200
}
178201

@@ -212,6 +235,41 @@ func (s *Scanner) scanSingleFile(filePath string) error {
212235
s.setScanningState(true)
213236
defer s.setScanningState(false)
214237

238+
// Check if path should be skipped
239+
if shouldSkipPath(filePath) {
240+
return nil
241+
}
242+
243+
// Get file info to check type and size
244+
fileInfo, err := os.Lstat(filePath) // Use Lstat to avoid following symlinks
245+
if err != nil {
246+
return err
247+
}
248+
249+
// Skip directories
250+
if fileInfo.IsDir() {
251+
logger.Debugf("Skipping directory: %s", filePath)
252+
return nil
253+
}
254+
255+
// Skip symlinks to avoid potential issues
256+
if fileInfo.Mode()&os.ModeSymlink != 0 {
257+
logger.Debugf("Skipping symlink: %s", filePath)
258+
return nil
259+
}
260+
261+
// Skip non-regular files (devices, pipes, sockets, etc.)
262+
if !fileInfo.Mode().IsRegular() {
263+
logger.Debugf("Skipping non-regular file: %s (mode: %s)", filePath, fileInfo.Mode())
264+
return nil
265+
}
266+
267+
// Skip files larger than 1MB before reading
268+
if fileInfo.Size() > 1024*1024 {
269+
logger.Debugf("Skipping large file: %s (size: %d bytes)", filePath, fileInfo.Size())
270+
return nil
271+
}
272+
215273
// Read file content
216274
content, err := os.ReadFile(filePath)
217275
if err != nil {
@@ -256,16 +314,15 @@ func (s *Scanner) ScanAllConfigs() error {
256314
defer s.setScanningState(false)
257315

258316
root := nginx.GetConfPath()
259-
sslDir := nginx.GetConfPath("ssl")
260317

261318
// Scan all files in the config directory and subdirectories
262319
return filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
263320
if err != nil {
264321
return err
265322
}
266323

267-
// Skip ssl directory
268-
if d.IsDir() && path == sslDir {
324+
// Skip excluded directories (ssl, cache, logs, temp, etc.)
325+
if d.IsDir() && shouldSkipPath(path) {
269326
return filepath.SkipDir
270327
}
271328

internal/cache/search.go

Lines changed: 116 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/blevesearch/bleve/v2/analysis/lang/en"
1414
"github.com/blevesearch/bleve/v2/mapping"
1515
"github.com/blevesearch/bleve/v2/search/query"
16+
"github.com/gabriel-vasile/mimetype"
1617
"github.com/uozi-tech/cosy/logger"
1718
)
1819

@@ -40,6 +41,12 @@ type SearchIndexer struct {
4041
ctx context.Context
4142
cancel context.CancelFunc
4243
cleanupOnce sync.Once
44+
45+
// Memory management
46+
totalContentSize int64
47+
documentCount int64
48+
maxMemoryUsage int64
49+
memoryMutex sync.RWMutex
4350
}
4451

4552
var (
@@ -57,7 +64,8 @@ func GetSearchIndexer() *SearchIndexer {
5764
}
5865

5966
searchIndexer = &SearchIndexer{
60-
indexPath: tempDir,
67+
indexPath: tempDir,
68+
maxMemoryUsage: 100 * 1024 * 1024, // 100MB memory limit for indexed content
6169
}
6270
})
6371
return searchIndexer
@@ -131,6 +139,12 @@ func (si *SearchIndexer) cleanup() {
131139
si.index = nil
132140
}
133141

142+
// Reset memory tracking
143+
si.memoryMutex.Lock()
144+
si.totalContentSize = 0
145+
si.documentCount = 0
146+
si.memoryMutex.Unlock()
147+
134148
// Remove the temporary directory
135149
if err := os.RemoveAll(si.indexPath); err != nil {
136150
logger.Error("Failed to remove search index directory:", err)
@@ -191,10 +205,10 @@ func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (er
191205
}
192206
}()
193207

194-
// File size limit: 10MB to prevent memory overflow
195-
const maxFileSize = 10 * 1024 * 1024 // 10MB
208+
// File size limit: 1MB to prevent memory overflow and improve performance
209+
const maxFileSize = 1024 * 1024 // 1MB
196210
if len(content) > maxFileSize {
197-
logger.Warn("Skipping file due to size limit", "path", configPath, "size", len(content), "limit", maxFileSize)
211+
logger.Debugf("Skipping file due to size limit, path: %s, size: %d, limit: %d", configPath, len(content), maxFileSize)
198212
return nil
199213
}
200214

@@ -203,9 +217,9 @@ func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (er
203217
return nil
204218
}
205219

206-
// Basic content validation: check if it's text content
207-
if !isTextContent(content) {
208-
logger.Warn("Skipping non-text file", "path", configPath)
220+
// Basic content validation: check if it's a configuration file
221+
if !isConfigFile(content) {
222+
logger.Debugf("Skipping non-config file: %s", configPath)
209223
return nil
210224
}
211225

@@ -249,22 +263,38 @@ func (si *SearchIndexer) IndexDocument(doc SearchDocument) (err error) {
249263
}
250264
}()
251265

266+
// Additional size check as a safety measure
267+
if len(doc.Content) > 2*1024*1024 { // 2MB absolute limit
268+
return fmt.Errorf("document content too large: %d bytes", len(doc.Content))
269+
}
270+
271+
// Check memory usage before indexing
272+
contentSize := int64(len(doc.Content))
273+
if !si.checkMemoryLimitBeforeIndexing(contentSize) {
274+
logger.Warn("Skipping document due to memory limit", "document_id", doc.ID, "content_size", contentSize)
275+
return nil
276+
}
277+
252278
si.indexMutex.RLock()
253279
defer si.indexMutex.RUnlock()
254280

255281
if si.index == nil {
256282
return fmt.Errorf("search index not initialized")
257283
}
258284

259-
// Additional size check as a safety measure
260-
if len(doc.Content) > 50*1024*1024 { // 50MB absolute limit
261-
return fmt.Errorf("document content too large: %d bytes", len(doc.Content))
285+
// Index the document
286+
err = si.index.Index(doc.ID, doc)
287+
if err != nil {
288+
return err
262289
}
263290

291+
// Update memory usage tracking
292+
si.updateMemoryUsage(doc.ID, contentSize, true)
293+
264294
// logger.Debugf("Indexing document: ID=%s, Type=%s, Name=%s, Path=%s",
265295
// doc.ID, doc.Type, doc.Name, doc.Path)
266296

267-
return si.index.Index(doc.ID, doc)
297+
return nil
268298
}
269299

270300
// Search performs a search query
@@ -324,7 +354,7 @@ func (si *SearchIndexer) searchWithType(ctx context.Context, queryStr string, do
324354
}
325355
results := si.convertResults(res.result)
326356

327-
// Debug log the search execution
357+
// log the search execution
328358
logger.Debugf("Search index query '%s' (type: %s, limit: %d) returned %d results",
329359
queryStr, docType, limit, len(results))
330360

@@ -436,6 +466,10 @@ func (si *SearchIndexer) DeleteDocument(docID string) error {
436466
return fmt.Errorf("search index not initialized")
437467
}
438468

469+
// Note: We don't track the exact size of deleted documents here
470+
// as it would require storing document sizes separately.
471+
// The memory tracking will reset during periodic cleanups or restarts.
472+
439473
return si.index.Delete(docID)
440474
}
441475

@@ -499,9 +533,16 @@ func (si *SearchIndexer) GetIndexStats() (map[string]interface{}, error) {
499533
return nil, err
500534
}
501535

536+
// Get memory usage statistics
537+
totalContentSize, trackedDocCount, maxMemoryUsage := si.getMemoryUsage()
538+
502539
return map[string]interface{}{
503-
"document_count": docCount,
504-
"index_path": si.indexPath,
540+
"document_count": docCount,
541+
"tracked_document_count": trackedDocCount,
542+
"total_content_size": totalContentSize,
543+
"max_memory_usage": maxMemoryUsage,
544+
"memory_usage_percent": float64(totalContentSize) / float64(maxMemoryUsage) * 100,
545+
"index_path": si.indexPath,
505546
}, nil
506547
}
507548

@@ -537,49 +578,72 @@ func SearchAll(ctx context.Context, query string, limit int) ([]SearchResult, er
537578
return GetSearchIndexer().Search(ctx, query, limit)
538579
}
539580

540-
// isTextContent checks if the content appears to be text-based
541-
// This helps prevent indexing binary files that might have been misidentified
542-
func isTextContent(content []byte) bool {
543-
if len(content) == 0 {
544-
return true // Empty content is considered text
545-
}
546-
547-
// Check for common binary file signatures
548-
if len(content) >= 4 {
549-
// Check for some common binary file headers
550-
switch {
551-
case content[0] == 0x7F && content[1] == 0x45 && content[2] == 0x4C && content[3] == 0x46: // ELF
552-
return false
553-
case content[0] == 0x89 && content[1] == 0x50 && content[2] == 0x4E && content[3] == 0x47: // PNG
554-
return false
555-
case content[0] == 0xFF && content[1] == 0xD8 && content[2] == 0xFF: // JPEG
556-
return false
557-
case content[0] == 0x50 && content[1] == 0x4B && content[2] == 0x03 && content[3] == 0x04: // ZIP
558-
return false
559-
case content[0] == 0x50 && content[1] == 0x4B && content[2] == 0x05 && content[3] == 0x06: // ZIP (empty)
560-
return false
561-
case content[0] == 0x50 && content[1] == 0x4B && content[2] == 0x07 && content[3] == 0x08: // ZIP (spanned)
562-
return false
563-
}
581+
// checkMemoryLimitBeforeIndexing checks if adding new content would exceed memory limits
582+
func (si *SearchIndexer) checkMemoryLimitBeforeIndexing(contentSize int64) bool {
583+
si.memoryMutex.RLock()
584+
defer si.memoryMutex.RUnlock()
585+
586+
// Check if adding this content would exceed the memory limit
587+
newTotalSize := si.totalContentSize + contentSize
588+
if newTotalSize > si.maxMemoryUsage {
589+
logger.Debugf("Memory limit would be exceeded: current=%d, new=%d, limit=%d",
590+
si.totalContentSize, newTotalSize, si.maxMemoryUsage)
591+
return false
564592
}
565593

566-
// Check if the first part of the content contains mostly printable characters
567-
// Sample up to 8KB for performance
568-
sampleSize := len(content)
569-
if sampleSize > 8192 {
570-
sampleSize = 8192
594+
// Also check document count limit (max 1000 documents)
595+
if si.documentCount >= 1000 {
596+
logger.Debugf("Document count limit reached: %d", si.documentCount)
597+
return false
571598
}
572599

573-
nonPrintableCount := 0
574-
for i := 0; i < sampleSize; i++ {
575-
b := content[i]
576-
// Allow printable ASCII characters, newlines, tabs, and carriage returns
577-
if (b < 32 && b != 9 && b != 10 && b != 13) || b > 126 {
578-
nonPrintableCount++
600+
return true
601+
}
602+
603+
// updateMemoryUsage updates the memory usage tracking
604+
func (si *SearchIndexer) updateMemoryUsage(documentID string, contentSize int64, isAddition bool) {
605+
si.memoryMutex.Lock()
606+
defer si.memoryMutex.Unlock()
607+
608+
if isAddition {
609+
si.totalContentSize += contentSize
610+
si.documentCount++
611+
logger.Debugf("Added document %s: size=%d, total_size=%d, count=%d",
612+
documentID, contentSize, si.totalContentSize, si.documentCount)
613+
} else {
614+
si.totalContentSize -= contentSize
615+
si.documentCount--
616+
if si.totalContentSize < 0 {
617+
si.totalContentSize = 0
579618
}
619+
if si.documentCount < 0 {
620+
si.documentCount = 0
621+
}
622+
logger.Debugf("Removed document %s: size=%d, total_size=%d, count=%d",
623+
documentID, contentSize, si.totalContentSize, si.documentCount)
624+
}
625+
}
626+
627+
// getMemoryUsage returns current memory usage statistics
628+
func (si *SearchIndexer) getMemoryUsage() (int64, int64, int64) {
629+
si.memoryMutex.RLock()
630+
defer si.memoryMutex.RUnlock()
631+
return si.totalContentSize, si.documentCount, si.maxMemoryUsage
632+
}
633+
634+
// isConfigFile checks if the content is a text/plain file (most nginx configs)
635+
func isConfigFile(content []byte) bool {
636+
if len(content) == 0 {
637+
return false // Empty files are not useful for configuration
638+
}
639+
640+
// Detect MIME type and only accept text/plain
641+
mtype := mimetype.Detect(content)
642+
643+
if mtype.Is("text/plain") {
644+
return true
580645
}
581646

582-
// If more than 30% of the sampled content is non-printable, consider it binary
583-
threshold := float64(sampleSize) * 0.3
584-
return float64(nonPrintableCount) <= threshold
647+
logger.Debugf("Skipping non-text/plain file with MIME type: %s", mtype.String())
648+
return false
585649
}

0 commit comments

Comments
 (0)