1+ using EssentialCSharp . Web . Models ;
2+ using HtmlAgilityPack ;
3+ using System . Text . RegularExpressions ;
4+
5+ namespace EssentialCSharp . Web . Services ;
6+
/// <summary>
/// Contract for pushing site content into the search index.
/// </summary>
public interface IContentIndexingService
{
    /// <summary>
    /// Indexes the content of every known site mapping.
    /// </summary>
    /// <param name="cancellationToken">Token used to request cancellation of the operation.</param>
    /// <returns>A task producing <c>true</c> on success; otherwise <c>false</c>.</returns>
    Task<bool> IndexAllContentAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Indexes the content referenced by a single site mapping.
    /// </summary>
    /// <param name="siteMapping">The site mapping whose page should be indexed.</param>
    /// <param name="cancellationToken">Token used to request cancellation of the operation.</param>
    /// <returns>A task producing <c>true</c> on success; otherwise <c>false</c>.</returns>
    Task<bool> IndexSiteMappingAsync(SiteMapping siteMapping, CancellationToken cancellationToken = default);
}
12+
/// <summary>
/// Builds <see cref="SearchDocument"/> instances from the HTML pages referenced by the
/// site mappings and hands them to the search service for indexing.
/// </summary>
public class ContentIndexingService : IContentIndexingService
{
    /// <summary>Maximum number of characters of page text kept in a search document.</summary>
    private const int MaxContentLength = 10000;

    /// <summary>Marker appended to page text that was cut at <see cref="MaxContentLength"/>.</summary>
    private const string TruncationSuffix = "...";

    private readonly ITypesenseSearchService _searchService;
    private readonly ISiteMappingService _siteMappingService;
    private readonly IWebHostEnvironment _environment;
    private readonly ILogger<ContentIndexingService> _logger;

    /// <summary>
    /// Creates the service from its collaborators.
    /// </summary>
    /// <param name="searchService">Search backend used to initialize the collection and index documents.</param>
    /// <param name="siteMappingService">Source of the site mappings to index.</param>
    /// <param name="environment">Hosting environment; its content root is the base for page file paths.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public ContentIndexingService(
        ITypesenseSearchService searchService,
        ISiteMappingService siteMappingService,
        IWebHostEnvironment environment,
        ILogger<ContentIndexingService> logger)
    {
        _searchService = searchService;
        _siteMappingService = siteMappingService;
        _environment = environment;
        _logger = logger;
    }

    /// <summary>
    /// Initializes the search collection, builds a document for every site mapping and
    /// indexes all of them in one call.
    /// </summary>
    /// <param name="cancellationToken">Token used to request cancellation of the operation.</param>
    /// <returns>
    /// <c>false</c> when collection initialization fails, when the operation is canceled, or
    /// when any exception occurs; <c>true</c> when there is nothing to index; otherwise the
    /// result reported by the search service.
    /// </returns>
    public async Task<bool> IndexAllContentAsync(CancellationToken cancellationToken = default)
    {
        try
        {
            _logger.LogInformation("Starting to index all content");

            // Initialize the collection if it doesn't exist
            if (!await _searchService.InitializeCollectionAsync(cancellationToken))
            {
                _logger.LogError("Failed to initialize Typesense collection");
                return false;
            }

            var documents = new List<SearchDocument>();

            foreach (var siteMapping in _siteMappingService.SiteMappings)
            {
                // Stop promptly instead of reading every remaining file after cancellation.
                cancellationToken.ThrowIfCancellationRequested();

                var document = await CreateSearchDocumentAsync(siteMapping, cancellationToken);
                if (document != null)
                {
                    documents.Add(document);
                }
            }

            if (documents.Count == 0)
            {
                _logger.LogWarning("No documents to index");
                return true;
            }

            var success = await _searchService.IndexDocumentsAsync(documents, cancellationToken);
            _logger.LogInformation("Indexed {Count} documents, success: {Success}", documents.Count, success);
            return success;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Cancellation is an expected outcome, not an error; keep the "return false" contract.
            _logger.LogWarning("Indexing of all content was canceled");
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to index all content");
            return false;
        }
    }

    /// <summary>
    /// Builds the search document for one site mapping and indexes it.
    /// </summary>
    /// <param name="siteMapping">The site mapping whose page should be indexed.</param>
    /// <param name="cancellationToken">Token used to request cancellation of the operation.</param>
    /// <returns>
    /// <c>false</c> when no document could be built, when the operation is canceled, or when
    /// any exception occurs; otherwise the result reported by the search service.
    /// </returns>
    public async Task<bool> IndexSiteMappingAsync(SiteMapping siteMapping, CancellationToken cancellationToken = default)
    {
        try
        {
            var document = await CreateSearchDocumentAsync(siteMapping, cancellationToken);
            if (document == null)
            {
                return false;
            }

            return await _searchService.IndexDocumentAsync(document, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Cancellation is an expected outcome, not an error; keep the "return false" contract.
            _logger.LogWarning("Indexing of site mapping {Key} was canceled", siteMapping.PrimaryKey);
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to index site mapping {Key}", siteMapping.PrimaryKey);
            return false;
        }
    }

    /// <summary>
    /// Reads the HTML page behind a site mapping and converts its body text into a
    /// <see cref="SearchDocument"/>.
    /// </summary>
    /// <param name="siteMapping">The site mapping describing the page.</param>
    /// <param name="cancellationToken">Token forwarded to the file read.</param>
    /// <returns>
    /// The document, or <c>null</c> when the file is missing, has no <c>body</c> element, or
    /// processing fails with a non-cancellation exception (which is logged).
    /// </returns>
    /// <exception cref="OperationCanceledException">The operation was canceled.</exception>
    private async Task<SearchDocument?> CreateSearchDocumentAsync(
        SiteMapping siteMapping,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var filePath = Path.Combine(_environment.ContentRootPath, Path.Combine(siteMapping.PagePath));
            if (!File.Exists(filePath))
            {
                _logger.LogWarning("File not found: {FilePath}", filePath);
                return null;
            }

            var htmlContent = await File.ReadAllTextAsync(filePath, cancellationToken);
            var doc = new HtmlDocument();
            doc.LoadHtml(htmlContent);

            // Extract content from body
            var bodyNode = doc.DocumentNode.SelectSingleNode("//body");
            if (bodyNode == null)
            {
                _logger.LogWarning("No body content found in {FilePath}", filePath);
                return null;
            }

            // Remove script and style elements. The leading '.' makes the XPath relative to
            // the body node; a bare "//" would search from the document root instead.
            var scriptsAndStyles = bodyNode.SelectNodes(".//script | .//style");
            if (scriptsAndStyles != null)
            {
                foreach (var node in scriptsAndStyles)
                {
                    node.Remove();
                }
            }

            // Extract plain text content
            var cleanContent = CleanTextContent(bodyNode.InnerText);

            // Tag the document with its chapter when a chapter title is available
            var tags = new List<string>();
            if (!string.IsNullOrEmpty(siteMapping.ChapterTitle))
            {
                tags.Add($"chapter-{siteMapping.ChapterNumber}");
            }

            // Build the URL from the first key, plus the anchor when there is one
            var url = $"/{siteMapping.Keys.First()}";
            if (!string.IsNullOrEmpty(siteMapping.AnchorId))
            {
                url += $"#{siteMapping.AnchorId}";
            }

            return new SearchDocument
            {
                Id = siteMapping.PrimaryKey,
                Title = siteMapping.RawHeading ?? siteMapping.ChapterTitle ?? "Unknown",
                Content = cleanContent,
                Url = url,
                Chapter = $"Chapter {siteMapping.ChapterNumber}: {siteMapping.ChapterTitle}",
                Section = siteMapping.RawHeading ?? string.Empty,
                Tags = tags,
                CreatedAt = DateTimeOffset.UtcNow.ToUnixTimeSeconds()
            };
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Cancellation is deliberately not handled here so that callers can stop their
            // work instead of treating it as one more failed page.
            _logger.LogError(ex, "Failed to create search document for {Key}", siteMapping.PrimaryKey);
            return null;
        }
    }

    /// <summary>
    /// Turns raw inner text into a single-line string suitable for indexing: HTML entities
    /// are decoded, whitespace runs collapse to one space, the result is trimmed and capped
    /// at <see cref="MaxContentLength"/> characters followed by <see cref="TruncationSuffix"/>.
    /// </summary>
    /// <param name="htmlText">Text extracted from the HTML body; may be null or empty.</param>
    /// <returns>The cleaned text, or an empty string for null/empty input.</returns>
    private static string CleanTextContent(string htmlText)
    {
        if (string.IsNullOrEmpty(htmlText))
        {
            return string.Empty;
        }

        // Decode HTML entities
        var decodedText = HtmlEntity.DeEntitize(htmlText);

        // Collapse whitespace runs (including line breaks) and drop leading/trailing whitespace
        var cleanText = Regex.Replace(decodedText, @"\s+", " ").Trim();

        // Limit content length for search indexing
        if (cleanText.Length > MaxContentLength)
        {
            var cut = MaxContentLength;

            // Do not split a surrogate pair: a lone high surrogate is not valid text.
            if (char.IsHighSurrogate(cleanText[cut - 1]))
            {
                cut--;
            }

            cleanText = cleanText[..cut] + TruncationSuffix;
        }

        return cleanText;
    }
}
0 commit comments