4141 <source ><![CDATA[
4242@Test
4343public void homePage() throws Exception {
44- try (final WebClient webClient = new WebClient()) {
44+ try (WebClient webClient = new WebClient()) {
4545 final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
46- Assert .assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
46+ Assertions .assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
4747
4848 final String pageAsXml = page.asXml();
49- Assert .assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
49+ Assertions .assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
5050
5151 final String pageAsText = page.asNormalizedText();
52- Assert .assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
52+ Assertions .assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
5353 }
5454}]]> </source >
5555 </section >
@@ -62,14 +62,14 @@ public void homePage() throws Exception {
6262 <source ><![CDATA[
6363@Test
6464public void submittingForm() throws Exception {
65- try (final WebClient webClient = new WebClient()) {
65+ try (WebClient webClient = new WebClient()) {
6666
6767 // Get the first page
68- final HtmlPage page1 = webClient.getPage("http://some_url");
68+ final HtmlPage page = webClient.getPage("http://some_url");
6969
70- // Get the form that we are dealing with and within that form,
70+ // Get the form that we are dealing with and within that form,
7171 // find the submit button and the field that we want to change.
72- final HtmlForm form = page1 .getFormByName("myform");
72+ final HtmlForm form = page .getFormByName("myform");
7373
7474 final HtmlSubmitInput button = form.getInputByName("submitbutton");
7575 final HtmlTextInput textField = form.getInputByName("userid");
@@ -78,7 +78,7 @@ public void submittingForm() throws Exception {
7878 textField.type("root");
7979
8080 // Now submit the form by clicking the button and get back the second page.
81- final HtmlPage page2 = button.click();
81+ final HtmlPage secondPage = button.click();
8282 }
8383}]]> </source >
8484
@@ -115,7 +115,7 @@ final HtmlTextInput textField = form.getInputByName("userid");]]></source>
115115 </p >
116116 </subsection >
117117
118- <subsection name =" Text input < input type='test '> " >
118+ <subsection name="Text input &lt;input type='text'&gt;">
119119 <p >
120120 These form elements are represented as instances of class HtmlTextInput.
121121 </p >
@@ -140,7 +140,7 @@ textField.type("RBRi");]]></source>
140140 These form elements are represented as instances of class HtmlTextArea.
141141 </p >
142142 <source ><![CDATA[
143- final HtmlTextArea textArea = form.getInputByName ("comment");]]> </source >
143+ final HtmlTextArea textArea = form.getTextAreaByName ("comment");]]> </source >
144144 <p >
145145 The usage of HtmlTextArea is similar to HtmlTextInput (because both are derived from HtmlSelectableTextInput).
146146 This means you can also use type(String) or even setValue(String) for updating these elements.
@@ -209,8 +209,9 @@ currency.setSelectedAttribute(euro, true);]]></source>
209209 <source ><![CDATA[
210210@Test
211211public void getElements() throws Exception {
212- try (final WebClient webClient = new WebClient()) {
212+ try (WebClient webClient = new WebClient()) {
213213 final HtmlPage page = webClient.getPage("http://some_url");
214+
214215 final HtmlDivision div = page.getHtmlElementById("some_div_id");
215216 final HtmlAnchor anchor = page.getAnchorByName("anchor_name");
216217 }
@@ -221,12 +222,13 @@ public void getElements() throws Exception {
221222 <source ><![CDATA[
222223 @Test
223224 public void getElements() throws Exception {
224- try (final WebClient webClient = new WebClient()) {
225- final HtmlPage page = webClient.getPage("http://some_url");
226- NodeList inputs = page.getElementsByTagName("input");
227- final Iterator<E> nodesIterator = nodes.iterator();
228- // now iterate
229- }
225+ try (WebClient webClient = new WebClient()) {
226+ final HtmlPage page = webClient.getPage("http://some_url");
227+
228+ final DomNodeList<DomElement> inputs = page.getElementsByTagName("input");
229+ final Iterator<DomElement> nodesIterator = inputs.iterator();
230+ // now iterate
231+ }
230232 }]]> </source >
231233 <p >
232234 There is a rich set of methods usable to locate page elements, e.g.
@@ -260,8 +262,8 @@ public void getElements() throws Exception {
260262 <source ><![CDATA[
261263@Test
262264public void xpath() throws Exception {
263- try (final WebClient webClient = new WebClient()) {
264- final HtmlPage page = webClient.getPage("https://htmlunit.sourceforge.io /");
265+ try (WebClient webClient = new WebClient()) {
266+ final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
265267
266268 //get list of all divs
267269 final List<?> divs = page.getByXPath("//div");
@@ -279,13 +281,13 @@ public void xpath() throws Exception {
279281 <source ><![CDATA[
280282@Test
281283public void cssSelector() throws Exception {
282- try (final WebClient webClient = new WebClient()) {
284+ try (WebClient webClient = new WebClient()) {
283285 final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
284286
285287 //get list of all divs
286288 final DomNodeList<DomNode> divs = page.querySelectorAll("div");
287- for (DomNode div : divs) {
288- ....
289+ for (final DomNode div : divs) {
290+ // ....
289291 }
290292
291293 //get div which has the id 'breadcrumbs'
@@ -295,5 +297,50 @@ public void cssSelector() throws Exception {
295297 </subsection >
296298 </section >
297299
300+ <section name =" Extracting text" >
301+ <p >
302+ When you need to extract text from a web page, think of it as a two-step process: find it, then extract it.
303+ Use HtmlUnit's search methods (like getElementById(), XPath, or CSS selectors) to find the specific element
304+ containing the text you want. Then simply call the asNormalizedText() method on that element to get the text
305+ exactly as a user would see it in their browser.<br />
306+ See the following example:
307+ </p >
308+ <source ><![CDATA[
309+ @Test
310+ public void extractTextToc() throws Exception {
311+ try (WebClient webClient = new WebClient()) {
312+ final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
313+
314+ final DomNode sponsoringDiv = page.querySelector("#bodyColumn > section:nth-child(1) > div:nth-child(2)");
315+
316+ // A normalized textual representation of this element that represents
317+ // what would be visible to the user if this page was shown in a web browser.
318+ // Whitespace is normalized like in the browser and block tags are separated by '\n'.
319+ final String content = sponsoringDiv.asNormalizedText();
320+ }
321+ }]]> </source >
322+
323+ <subsection name =" Extracting the whole page content" >
324+ <p >
325+ If you want to extract all text from an entire page without targeting specific elements, you can call asNormalizedText()
326+ directly on the body element. This is useful for getting a quick overview of all visible content or for full-text indexing.<br />
327+ Here's a simple example that loads a page and extracts all its text:
328+ </p >
329+ <source ><![CDATA[
330+ @Test
331+ public void extractTextFromBody() throws Exception {
332+ try (WebClient webClient = new WebClient()) {
333+ final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
334+
335+ final HtmlBody body = page.getBody();
336+
337+ // A normalized textual representation of this element that represents
338+ // what would be visible to the user if this page was shown in a web browser.
339+ // Whitespace is normalized like in the browser and block tags are separated by '\n'.
340+ final String bodyContent = body.asNormalizedText();
341+ }
342+ }]]> </source >
343+ </subsection >
344+ </section >
298345 </body >
299346</document >
0 commit comments