Skip to content

Commit 288c774

Browse files
committed
some words about text extraction with HtmlUnit added
1 parent 91ed24c commit 288c774

File tree

2 files changed

+247
-23
lines changed

2 files changed

+247
-23
lines changed

src/site/xdoc/gettingStarted.xml

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@
4141
<source><![CDATA[
4242
@Test
4343
public void homePage() throws Exception {
44-
try (final WebClient webClient = new WebClient()) {
44+
try (WebClient webClient = new WebClient()) {
4545
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
46-
Assert.assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
46+
Assertions.assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
4747
4848
final String pageAsXml = page.asXml();
49-
Assert.assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
49+
Assertions.assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
5050
5151
final String pageAsText = page.asNormalizedText();
52-
Assert.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
52+
Assertions.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
5353
}
5454
}]]></source>
5555
</section>
@@ -62,14 +62,14 @@ public void homePage() throws Exception {
6262
<source><![CDATA[
6363
@Test
6464
public void submittingForm() throws Exception {
65-
try (final WebClient webClient = new WebClient()) {
65+
try (WebClient webClient = new WebClient()) {
6666
6767
// Get the first page
68-
final HtmlPage page1 = webClient.getPage("http://some_url");
68+
final HtmlPage page = webClient.getPage("http://some_url");
6969
70-
// Get the form that we are dealing with and within that form,
70+
// Get the form that we are dealing with and within that form,
7171
// find the submit button and the field that we want to change.
72-
final HtmlForm form = page1.getFormByName("myform");
72+
final HtmlForm form = page.getFormByName("myform");
7373
7474
final HtmlSubmitInput button = form.getInputByName("submitbutton");
7575
final HtmlTextInput textField = form.getInputByName("userid");
@@ -78,7 +78,7 @@ public void submittingForm() throws Exception {
7878
textField.type("root");
7979
8080
// Now submit the form by clicking the button and get back the second page.
81-
final HtmlPage page2 = button.click();
81+
final HtmlPage secondPage = button.click();
8282
}
8383
}]]></source>
8484

@@ -115,7 +115,7 @@ final HtmlTextInput textField = form.getInputByName("userid");]]></source>
115115
</p>
116116
</subsection>
117117

118-
<subsection name="Text input &lt;input type='test'&gt;">
118+
<subsection name="Text input &lt;input type='text'&gt;">
119119
<p>
120120
These form elements are represented as instances of class HtmlTextInput.
121121
</p>
@@ -140,7 +140,7 @@ textField.type("RBRi");]]></source>
140140
These form elements are represented as instances of class HtmlTextArea.
141141
</p>
142142
<source><![CDATA[
143-
final HtmlTextArea textArea = form.getInputByName("comment");]]></source>
143+
final HtmlTextArea textArea = form.getTextAreaByName("comment");]]></source>
144144
<p>
145145
The usage of HtmlTextArea is similar to HtmlTextInput (because both are derived from HtmlSelectableTextInput).
146146
This means you can also use type(String) or even setValue(String) for updating these elements.
@@ -209,8 +209,9 @@ currency.setSelectedAttribute(euro, true);]]></source>
209209
<source><![CDATA[
210210
@Test
211211
public void getElements() throws Exception {
212-
try (final WebClient webClient = new WebClient()) {
212+
try (WebClient webClient = new WebClient()) {
213213
final HtmlPage page = webClient.getPage("http://some_url");
214+
214215
final HtmlDivision div = page.getHtmlElementById("some_div_id");
215216
final HtmlAnchor anchor = page.getAnchorByName("anchor_name");
216217
}
@@ -221,12 +222,13 @@ public void getElements() throws Exception {
221222
<source><![CDATA[
222223
@Test
223224
public void getElements() throws Exception {
224-
try (final WebClient webClient = new WebClient()) {
225-
final HtmlPage page = webClient.getPage("http://some_url");
226-
NodeList inputs = page.getElementsByTagName("input");
227-
final Iterator<E> nodesIterator = nodes.iterator();
228-
// now iterate
229-
}
225+
try (WebClient webClient = new WebClient()) {
226+
final HtmlPage page = webClient.getPage("http://some_url");
227+
228+
final DomNodeList<DomElement> inputs = page.getElementsByTagName("input");
229+
final Iterator<DomElement> nodesIterator = inputs.iterator();
230+
// now iterate
231+
}
230232
}]]></source>
231233
<p>
232234
There is a rich set of methods usable to locate page elements, e.g.
@@ -260,8 +262,8 @@ public void getElements() throws Exception {
260262
<source><![CDATA[
261263
@Test
262264
public void xpath() throws Exception {
263-
try (final WebClient webClient = new WebClient()) {
264-
final HtmlPage page = webClient.getPage("https://htmlunit.sourceforge.io/");
265+
try (WebClient webClient = new WebClient()) {
266+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
265267
266268
//get list of all divs
267269
final List<?> divs = page.getByXPath("//div");
@@ -279,13 +281,13 @@ public void xpath() throws Exception {
279281
<source><![CDATA[
280282
@Test
281283
public void cssSelector() throws Exception {
282-
try (final WebClient webClient = new WebClient()) {
284+
try (WebClient webClient = new WebClient()) {
283285
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
284286
285287
//get list of all divs
286288
final DomNodeList<DomNode> divs = page.querySelectorAll("div");
287-
for (DomNode div : divs) {
288-
....
289+
for (final DomNode div : divs) {
290+
// ....
289291
}
290292
291293
//get div which has the id 'breadcrumbs'
@@ -295,5 +297,50 @@ public void cssSelector() throws Exception {
295297
</subsection>
296298
</section>
297299

300+
<section name="Extracting text">
301+
<p>
302+
When you need to extract text from a web page, think of it as a two-step process: find it, then extract it.
303+
Use HtmlUnit's search methods (like getElementById(), XPath, or CSS selectors) to find the specific element
304+
containing the text you want. Then simply call the asNormalizedText() method on that element to get the text
305+
exactly as a user would see it in their browser.<br/>
306+
See the following example:
307+
</p>
308+
<source><![CDATA[
309+
@Test
310+
public void extractTextToc() throws Exception {
311+
try (WebClient webClient = new WebClient()) {
312+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
313+
314+
final DomNode sponsoringDiv = page.querySelector("#bodyColumn > section:nth-child(1) > div:nth-child(2)");
315+
316+
// A normalized textual representation of this element that represents
317+
// what would be visible to the user if this page was shown in a web browser.
318+
// Whitespace is normalized like in the browser and block tags are separated by '\n'.
319+
final String content = sponsoringDiv.asNormalizedText();
320+
}
321+
}]]></source>
322+
323+
<subsection name="Extracting the whole page content">
324+
<p>
325+
If you want to extract all text from an entire page without targeting specific elements, you can call asNormalizedText()
326+
directly on the body element. This is useful for getting a quick overview of all visible content or for full-text indexing.<br/>
327+
Here's a simple example that loads a page and extracts all its text:
328+
</p>
329+
<source><![CDATA[
330+
@Test
331+
public void extractTextFromBody() throws Exception {
332+
try (WebClient webClient = new WebClient()) {
333+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
334+
335+
final HtmlBody body = page.getBody();
336+
337+
// A normalized textual representation of this element that represents
338+
// what would be visible to the user if this page was shown in a web browser.
339+
// Whitespace is normalized like in the browser and block tags are separated by '\n'.
340+
final String bodyContent = body.asNormalizedText();
341+
}
342+
}]]></source>
343+
</subsection>
344+
</section>
298345
</body>
299346
</document>
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
/*
2+
* Copyright (c) 2002-2025 Gargoyle Software Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
* https://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
package org.htmlunit.doc;
16+
17+
import java.util.Iterator;
18+
import java.util.List;
19+
20+
import org.htmlunit.WebClient;
21+
import org.htmlunit.WebServerTestCase;
22+
import org.htmlunit.html.DomElement;
23+
import org.htmlunit.html.DomNode;
24+
import org.htmlunit.html.DomNodeList;
25+
import org.htmlunit.html.HtmlAnchor;
26+
import org.htmlunit.html.HtmlBody;
27+
import org.htmlunit.html.HtmlDivision;
28+
import org.htmlunit.html.HtmlForm;
29+
import org.htmlunit.html.HtmlPage;
30+
import org.htmlunit.html.HtmlSubmitInput;
31+
import org.htmlunit.html.HtmlTextInput;
32+
import org.junit.jupiter.api.Assertions;
33+
import org.junit.jupiter.api.Test;
34+
35+
/**
36+
* Tests for the sample code from the documentation to make sure
37+
* we adapt the documentation or do not break the samples.
38+
*
39+
* @author Ronald Brill
40+
*/
41+
public class GettingStartedTest extends WebServerTestCase {
42+
43+
/**
44+
* @throws Exception if an error occurs
45+
*/
46+
@Test
47+
public void homePage() throws Exception {
48+
try (WebClient webClient = new WebClient()) {
49+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
50+
Assertions.assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
51+
52+
final String pageAsXml = page.asXml();
53+
Assertions.assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
54+
55+
final String pageAsText = page.asNormalizedText();
56+
Assertions.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
57+
}
58+
}
59+
60+
/**
61+
* @throws Exception if an error occurs
62+
*/
63+
@Test
64+
public void xpath() throws Exception {
65+
try (WebClient webClient = new WebClient()) {
66+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
67+
68+
//get list of all divs
69+
final List<?> divs = page.getByXPath("//div");
70+
71+
//get div which has a 'id' attribute of 'banner'
72+
final HtmlDivision div = (HtmlDivision) page.getByXPath("//div[@id='banner']").get(0);
73+
}
74+
}
75+
76+
/**
77+
* @throws Exception if an error occurs
78+
*/
79+
@Test
80+
public void cssSelector() throws Exception {
81+
try (WebClient webClient = new WebClient()) {
82+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
83+
84+
//get list of all divs
85+
final DomNodeList<DomNode> divs = page.querySelectorAll("div");
86+
for (final DomNode div : divs) {
87+
// ....
88+
}
89+
90+
//get div which has the id 'breadcrumbs'
91+
final DomNode div = page.querySelector("div#breadcrumbs");
92+
}
93+
}
94+
95+
/**
96+
* @throws Exception if an error occurs
97+
*/
98+
public void submittingForm() throws Exception {
99+
try (WebClient webClient = new WebClient()) {
100+
101+
// Get the first page
102+
final HtmlPage page = webClient.getPage("http://some_url");
103+
104+
// Get the form that we are dealing with and within that form,
105+
// find the submit button and the field that we want to change.
106+
final HtmlForm form = page.getFormByName("myform");
107+
108+
final HtmlSubmitInput button = form.getInputByName("submitbutton");
109+
final HtmlTextInput textField = form.getInputByName("userid");
110+
111+
// Change the value of the text field
112+
textField.type("root");
113+
114+
// Now submit the form by clicking the button and get back the second page.
115+
final HtmlPage secondPage = button.click();
116+
}
117+
}
118+
119+
/**
120+
* @throws Exception if an error occurs
121+
*/
122+
@Test
123+
public void extractTextToc() throws Exception {
124+
try (WebClient webClient = new WebClient()) {
125+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
126+
127+
final DomNode sponsoringDiv = page.querySelector("#bodyColumn > section:nth-child(1) > div:nth-child(2)");
128+
129+
// A normalized textual representation of this element that represents
130+
// what would be visible to the user if this page was shown in a web browser.
131+
// Whitespace is normalized like in the browser and block tags are separated by '\n'.
132+
final String content = sponsoringDiv.asNormalizedText();
133+
}
134+
}
135+
136+
/**
137+
* @throws Exception if an error occurs
138+
*/
139+
@Test
140+
public void extractTextFromBody() throws Exception {
141+
try (WebClient webClient = new WebClient()) {
142+
final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
143+
144+
final HtmlBody body = page.getBody();
145+
146+
// A normalized textual representation of this element that represents
147+
// what would be visible to the user if this page was shown in a web browser.
148+
// Whitespace is normalized like in the browser and block tags are separated by '\n'.
149+
final String bodyContent = body.asNormalizedText();
150+
}
151+
}
152+
153+
/**
154+
* @throws Exception if an error occurs
155+
*/
156+
public void getElements() throws Exception {
157+
try (WebClient webClient = new WebClient()) {
158+
final HtmlPage page = webClient.getPage("http://some_url");
159+
160+
final HtmlDivision div = page.getHtmlElementById("some_div_id");
161+
final HtmlAnchor anchor = page.getAnchorByName("anchor_name");
162+
}
163+
}
164+
165+
/**
166+
* @throws Exception if an error occurs
167+
*/
168+
public void getElements2() throws Exception {
169+
try (WebClient webClient = new WebClient()) {
170+
final HtmlPage page = webClient.getPage("http://some_url");
171+
172+
final DomNodeList<DomElement> inputs = page.getElementsByTagName("input");
173+
final Iterator<DomElement> nodesIterator = inputs.iterator();
174+
// now iterate
175+
}
176+
}
177+
}

0 commit comments

Comments
 (0)