@@ -65,6 +65,7 @@ pub fn extract_content_as_markdown(
6565mod tests {
6666 use super :: * ;
6767 use crate :: DensityTree ;
68+ use std:: fs;
6869
6970 #[ test]
7071 #[ cfg( feature = "markdown" ) ]
@@ -89,13 +90,110 @@ mod tests {
8990
9091 let markdown = extract_content_as_markdown ( & dtree, & document) . unwrap ( ) ;
9192
92- // Debug: print what we actually got
93- println ! ( "Generated markdown: '{}'" , markdown) ;
94-
9593 // Should contain the main content
9694 assert ! ( !markdown. is_empty( ) , "Markdown should not be empty" ) ;
97- // Relaxed assertions for debugging
9895 assert ! ( markdown. contains( "Main Article" ) ) ;
9996 assert ! ( markdown. contains( "main content" ) ) ;
10097 }
98+
/// Runs the Markdown extraction over the `html/test_1.html` fixture and
/// checks that the expected article snippets are present while the
/// footer-navigation strings are absent.
///
/// The fixture path is relative, so the test relies on being started from
/// the directory that contains `html/`.
#[test]
#[cfg(feature = "markdown")]
fn test_extract_from_test1_html() {
    let raw_html = fs::read_to_string("html/test_1.html").expect("Unable to read test_1.html");
    let parsed = Html::parse_document(&raw_html);

    let mut tree = DensityTree::from_document(&parsed).unwrap();
    tree.calculate_density_sum().unwrap();

    let markdown = extract_content_as_markdown(&tree, &parsed).unwrap();

    // Visible with `--nocapture`; handy when an assertion below fails.
    println!("test1 markdown: '{}'", markdown);

    assert!(!markdown.is_empty(), "Markdown should not be empty");

    // Snippets expected from the article body.
    for wanted in ["Here is text", "Paragraph text", "huge paragraph"] {
        // The message names the failing snippet, matching what a plain
        // `assert!(markdown.contains("..."))` would report.
        assert!(
            markdown.contains(wanted),
            "assertion failed: markdown.contains({:?})",
            wanted
        );
    }

    // Strings treated as footer navigation; none may leak into the output.
    for unwanted in ["Menu", "link1"] {
        assert!(
            !markdown.contains(unwanted),
            "assertion failed: !markdown.contains({:?})",
            unwanted
        );
    }
}
123+
/// Runs the Markdown extraction over the `html/test_2.html` fixture and
/// checks that the expected article snippets are present.
///
/// The `wikipedia` check only proves that the text survives extraction;
/// it does not verify that Markdown link syntax was produced.
#[test]
#[cfg(feature = "markdown")]
fn test_extract_from_test2_html() {
    let raw_html = fs::read_to_string("html/test_2.html").expect("Unable to read test_2.html");
    let parsed = Html::parse_document(&raw_html);

    let mut tree = DensityTree::from_document(&parsed).unwrap();
    tree.calculate_density_sum().unwrap();

    let markdown = extract_content_as_markdown(&tree, &parsed).unwrap();

    // Visible with `--nocapture`; handy when an assertion below fails.
    println!("test2 markdown: '{}'", markdown);

    assert!(!markdown.is_empty(), "Markdown should not be empty");

    // Article-body snippets first, then the text expected from a link.
    for wanted in ["Here is text", "long paragraph", "wikipedia"] {
        // The message names the failing snippet, matching what a plain
        // `assert!(markdown.contains("..."))` would report.
        assert!(
            markdown.contains(wanted),
            "assertion failed: markdown.contains({:?})",
            wanted
        );
    }
}
146+
/// Runs the Markdown extraction over the `html/test_4.html` fixture and
/// checks that the expected article snippets are present while the
/// script- and comment-related strings are absent.
#[test]
#[cfg(feature = "markdown")]
fn test_extract_from_test4_html() {
    let raw_html = fs::read_to_string("html/test_4.html").expect("Unable to read test_4.html");
    let parsed = Html::parse_document(&raw_html);

    let mut tree = DensityTree::from_document(&parsed).unwrap();
    tree.calculate_density_sum().unwrap();

    let markdown = extract_content_as_markdown(&tree, &parsed).unwrap();

    // Visible with `--nocapture`; handy when an assertion below fails.
    println!("test4 markdown: '{}'", markdown);

    assert!(!markdown.is_empty(), "Markdown should not be empty");

    // Snippets that must be kept.
    for wanted in ["Lorem ipsum", "long paragraph", "wikipedia"] {
        // The message names the failing snippet, matching what a plain
        // `assert!(markdown.contains("..."))` would report.
        assert!(
            markdown.contains(wanted),
            "assertion failed: markdown.contains({:?})",
            wanted
        );
    }

    // Strings expected only in script/comment content; they must be dropped.
    for unwanted in ["myFunction", "Some comments"] {
        assert!(
            !markdown.contains(unwanted),
            "assertion failed: !markdown.contains({:?})",
            unwanted
        );
    }
}
171+
/// Feeds the extractor a page whose body holds nothing but a `<script>`
/// element and expects the resulting Markdown to be the empty string.
#[test]
#[cfg(feature = "markdown")]
fn test_empty_content_returns_empty_markdown() {
    // Inline fixture: no readable text, only a script tag.
    let script_only_page = r#"
        <html>
            <body>
                <script>console.log("empty")</script>
            </body>
        </html>
    "#;

    let parsed = Html::parse_document(script_only_page);

    let mut tree = DensityTree::from_document(&parsed).unwrap();
    tree.calculate_density_sum().unwrap();

    let markdown = extract_content_as_markdown(&tree, &parsed).unwrap();

    // Visible with `--nocapture`; handy when the assertion below fails.
    println!("empty content markdown: '{}'", markdown);

    assert!(
        markdown.is_empty(),
        "Expected empty markdown for content-less HTML, got: '{}'",
        markdown
    );
}
101199}
0 commit comments