Add some words about text extraction with HtmlUnit to the getting started guide; fix the samples and add a test covering them

rbri · rbri · commit 288c774187ab · 2025-10-06T18:59:04.000+02:00
diff --git a/src/site/xdoc/gettingStarted.xml b/src/site/xdoc/gettingStarted.xml
@@ -41,15 +41,15 @@
             <source><![CDATA[
 @Test
 public void homePage() throws Exception {
-    try (final WebClient webClient = new WebClient()) {
+    try (WebClient webClient = new WebClient()) {
         final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
-        Assert.assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
+        Assertions.assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
 
         final String pageAsXml = page.asXml();
-        Assert.assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
+        Assertions.assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
 
         final String pageAsText = page.asNormalizedText();
-        Assert.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
+        Assertions.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
     }
 }]]></source>
         </section>
@@ -62,14 +62,14 @@ public void homePage() throws Exception {
             <source><![CDATA[
 @Test
 public void submittingForm() throws Exception {
-    try (final WebClient webClient = new WebClient()) {
+    try (WebClient webClient = new WebClient()) {
 
         // Get the first page
-        final HtmlPage page1 = webClient.getPage("http://some_url");
+        final HtmlPage page = webClient.getPage("http://some_url");
 
-        // Get the form that we are dealing with and within that form, 
+        // Get the form that we are dealing with and within that form,
         // find the submit button and the field that we want to change.
-        final HtmlForm form = page1.getFormByName("myform");
+        final HtmlForm form = page.getFormByName("myform");
 
         final HtmlSubmitInput button = form.getInputByName("submitbutton");
         final HtmlTextInput textField = form.getInputByName("userid");
@@ -78,7 +78,7 @@ public void submittingForm() throws Exception {
         textField.type("root");
 
         // Now submit the form by clicking the button and get back the second page.
-        final HtmlPage page2 = button.click();
+        final HtmlPage secondPage = button.click();
     }
 }]]></source>
 
@@ -115,7 +115,7 @@ final HtmlTextInput textField = form.getInputByName("userid");]]></source>
                 </p>
             </subsection>
 
-            <subsection name="Text input &lt;input type='test'&gt;">
+            <subsection name="Text input &lt;input type='text'&gt;">
                 <p>
                     These form elements represented as instances of class HtmlTextInput.
                 </p>
@@ -140,7 +140,7 @@ textField.type("RBRi");]]></source>
                     These form elements represented as instances of class HtmlTextArea.
                 </p>
                 <source><![CDATA[
-final HtmlTextArea textArea = form.getInputByName("comment");]]></source>
+final HtmlTextArea textArea = form.getTextAreaByName("comment");]]></source>
                 <p>
                     The usage of HtmlTextArea is similar to HtmlTextInput (because both derived from HtmlSelectableTextInput).
                     This means you can also use type(String) or even setValue(String) for updating these elements.
@@ -209,8 +209,9 @@ currency.setSelectedAttribute(euro, true);]]></source>
                 <source><![CDATA[
 @Test
 public void getElements() throws Exception {
-    try (final WebClient webClient = new WebClient()) {
+    try (WebClient webClient = new WebClient()) {
         final HtmlPage page = webClient.getPage("http://some_url");
+
         final HtmlDivision div = page.getHtmlElementById("some_div_id");
         final HtmlAnchor anchor = page.getAnchorByName("anchor_name");
     }
@@ -221,12 +222,13 @@ public void getElements() throws Exception {
                 <source><![CDATA[
  @Test
  public void getElements() throws Exception {
-     try (final WebClient webClient = new WebClient()) {
-         final HtmlPage page = webClient.getPage("http://some_url");
-         NodeList inputs = page.getElementsByTagName("input");
-         final Iterator<E> nodesIterator = nodes.iterator();
-         // now iterate
-     }
+    try (WebClient webClient = new WebClient()) {
+        final HtmlPage page = webClient.getPage("http://some_url");
+
+        final DomNodeList<DomElement> inputs = page.getElementsByTagName("input");
+        final Iterator<DomElement> nodesIterator = inputs.iterator();
+        // now iterate
+    }
  }]]></source>
                 <p>
                     There is rich set of methods usable to locate page elements e.g.
@@ -260,8 +262,8 @@ public void getElements() throws Exception {
                 <source><![CDATA[
 @Test
 public void xpath() throws Exception {
-    try (final WebClient webClient = new WebClient()) {
-        final HtmlPage page = webClient.getPage("https://htmlunit.sourceforge.io/");
+    try (WebClient webClient = new WebClient()) {
+        final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
 
         //get list of all divs
         final List<?> divs = page.getByXPath("//div");
@@ -279,13 +281,13 @@ public void xpath() throws Exception {
                 <source><![CDATA[
 @Test
 public void cssSelector() throws Exception {
-    try (final WebClient webClient = new WebClient()) {
+    try (WebClient webClient = new WebClient()) {
         final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
 
         //get list of all divs
         final DomNodeList<DomNode> divs = page.querySelectorAll("div");
-        for (DomNode div : divs) {
-            ....
+        for (final DomNode div : divs) {
+            // ....
         }
 
         //get div which has the id 'breadcrumbs'
@@ -295,5 +297,50 @@ public void cssSelector() throws Exception {
             </subsection>
         </section>
 
+        <section name="Extracting text">
+            <p>
+                When you need to extract text from a web page, think of it as a two-step process: find it, then extract it.
+                Use HtmlUnit's search methods (like getElementById(), XPath, or CSS selectors) to find the specific element
+                containing the text you want. Then simply call the asNormalizedText() method on that element to get the text
+                exactly as a user would see it in their browser.<br/>
+                See the following example:
+            </p>
+                <source><![CDATA[
+@Test
+public void extractTextToc() throws Exception {
+    try (WebClient webClient = new WebClient()) {
+        final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+
+        final DomNode sponsoringDiv = page.querySelector("#bodyColumn > section:nth-child(1) > div:nth-child(2)");
+
+        // A normalized textual representation of this element that represents
+        // what would be visible to the user if this page was shown in a web browser.
+        // Whitespace is normalized like in the browser and block tags are separated by '\n'.
+        final String content = sponsoringDiv.asNormalizedText();
+    }
+}]]></source>
+
+            <subsection name="Extracting the whole page content">
+                <p>
+                    If you want to extract all text from an entire page without targeting specific elements, you can call asNormalizedText()
+                    directly on the body element. This is useful for getting a quick overview of all visible content or for full-text indexing.<br/>
+                    Here's a simple example that loads a page and extracts all its text:
+                </p>
+                <source><![CDATA[
+@Test
+public void extractTextFromBody() throws Exception {
+    try (WebClient webClient = new WebClient()) {
+        final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+
+        final HtmlBody body = page.getBody();
+
+        // A normalized textual representation of this element that represents
+        // what would be visible to the user if this page was shown in a web browser.
+        // Whitespace is normalized like in the browser and block tags are separated by '\n'.
+        final String bodyContent = body.asNormalizedText();
+    }
+}]]></source>
+            </subsection>
+        </section>
     </body>
 </document>
diff --git a/src/test/java/org/htmlunit/doc/GettingStartedTest.java b/src/test/java/org/htmlunit/doc/GettingStartedTest.java
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2002-2025 Gargoyle Software Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.htmlunit.doc;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.htmlunit.WebClient;
+import org.htmlunit.WebServerTestCase;
+import org.htmlunit.html.DomElement;
+import org.htmlunit.html.DomNode;
+import org.htmlunit.html.DomNodeList;
+import org.htmlunit.html.HtmlAnchor;
+import org.htmlunit.html.HtmlBody;
+import org.htmlunit.html.HtmlDivision;
+import org.htmlunit.html.HtmlForm;
+import org.htmlunit.html.HtmlPage;
+import org.htmlunit.html.HtmlSubmitInput;
+import org.htmlunit.html.HtmlTextInput;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the sample code from the documentation, to make sure that the samples
+ * keep working and that the documentation is adapted whenever they have to change.
+ *
+ * @author Ronald Brill
+ */
+public class GettingStartedTest extends WebServerTestCase {
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    @Test
+    public void homePage() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+            Assertions.assertEquals("HtmlUnit – Welcome to HtmlUnit", page.getTitleText());
+
+            final String pageAsXml = page.asXml();
+            Assertions.assertTrue(pageAsXml.contains("<body class=\"topBarDisabled\">"));
+
+            final String pageAsText = page.asNormalizedText();
+            Assertions.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    @Test
+    public void xpath() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+
+            //get list of all divs
+            final List<?> divs = page.getByXPath("//div");
+
+            //get div which has an 'id' attribute of 'banner'
+            final HtmlDivision div = (HtmlDivision) page.getByXPath("//div[@id='banner']").get(0);
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    @Test
+    public void cssSelector() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+
+            //get list of all divs
+            final DomNodeList<DomNode> divs = page.querySelectorAll("div");
+            for (final DomNode div : divs) {
+                // ....
+            }
+
+            //get div which has the id 'breadcrumbs'
+            final DomNode div = page.querySelector("div#breadcrumbs");
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    public void submittingForm() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+
+            // Get the first page
+            final HtmlPage page = webClient.getPage("http://some_url");
+
+            // Get the form that we are dealing with and within that form,
+            // find the submit button and the field that we want to change.
+            final HtmlForm form = page.getFormByName("myform");
+
+            final HtmlSubmitInput button = form.getInputByName("submitbutton");
+            final HtmlTextInput textField = form.getInputByName("userid");
+
+            // Change the value of the text field
+            textField.type("root");
+
+            // Now submit the form by clicking the button and get back the second page.
+            final HtmlPage secondPage = button.click();
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    @Test
+    public void extractTextToc() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+
+            final DomNode sponsoringDiv = page.querySelector("#bodyColumn > section:nth-child(1) > div:nth-child(2)");
+
+            // A normalized textual representation of this element that represents
+            // what would be visible to the user if this page was shown in a web browser.
+            // Whitespace is normalized like in the browser and block tags are separated by '\n'.
+            final String content = sponsoringDiv.asNormalizedText();
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    @Test
+    public void extractTextFromBody() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("https://www.htmlunit.org/");
+
+            final HtmlBody body = page.getBody();
+
+            // A normalized textual representation of this element that represents
+            // what would be visible to the user if this page was shown in a web browser.
+            // Whitespace is normalized like in the browser and block tags are separated by '\n'.
+            final String bodyContent = body.asNormalizedText();
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    public void getElements() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("http://some_url");
+
+            final HtmlDivision div = page.getHtmlElementById("some_div_id");
+            final HtmlAnchor anchor = page.getAnchorByName("anchor_name");
+        }
+    }
+
+    /**
+     * @throws Exception if an error occurs
+     */
+    public void getElements2() throws Exception {
+        try (WebClient webClient = new WebClient()) {
+            final HtmlPage page = webClient.getPage("http://some_url");
+
+            final DomNodeList<DomElement> inputs = page.getElementsByTagName("input");
+            final Iterator<DomElement> nodesIterator = inputs.iterator();
+            // now iterate
+        }
+    }
+}