66
77use DOMDocument ;
88use DOMXPath ;
9+ use Exception ;
910use RuntimeException ;
1011
1112final class Extractor
@@ -14,11 +15,11 @@ final class Extractor
1415 * Extracts the contents of all <script type="text/llms.txt"> blocks from a given HTML file.
1516 *
1617 * @param string $filePath Path to the HTML file.
17- * @throws RuntimeException If a file isn't found or unreadable.
18+ * @param bool $parsed Whether the llms.txt should be parsed or not.
19+ * @throws RuntimeException|Exception If the file isn't found or is unreadable, or if the llms.txt parsing fails.
1820 * @return string[] Extracted contents.
19- *
2021 */
21- public function extractFromFile (string $ filePath ): array
22+ public function extractFromFile (string $ filePath, bool $ parsed = false ): array
2223 {
2324 if (!\is_readable ($ filePath )) {
2425 throw new RuntimeException ("HTML file {$ filePath } not found or not readable " );
@@ -30,16 +31,18 @@ public function extractFromFile(string $filePath): array
3031 throw new RuntimeException ("Failed to read HTML file {$ filePath }" );
3132 }
3233
33- return $ this ->extractFromHtml ($ html );
34+ return $ this ->extractFromHtml ($ html, $ parsed );
3435 }
3536
3637 /**
3738 * Extracts the contents of all <script type="text/llms.txt"> blocks from raw HTML.
3839 *
3940 * @param string $html Raw HTML string.
40- * @return string[] Extracted contents.
41+ * @param bool $parsed Whether the llms.txt should be parsed or not.
42+ * @throws Exception If the llms.txt parsing fails.
43+ * @return array Extracted contents.
4144 */
42- public function extractFromHtml (string $ html ): array
45+ public function extractFromHtml (string $ html, bool $ parsed = false ): array
4346 {
4447 $ dom = new DOMDocument ();
4548 // Suppress warnings from malformed HTML
@@ -50,7 +53,11 @@ public function extractFromHtml(string $html): array
5053
5154 $ contents = [];
5255 foreach ($ nodes as $ node ) {
53- $ contents [] = \trim ($ node ->textContent );
56+ $ content = \trim ($ node ->textContent );
57+ if ($ parsed ) {
58+ $ content = (new LlmsTxt )->parse ($ node ->textContent );
59+ }
60+ $ contents [] = $ content ;
5461 }
5562
5663 return $ contents ;
0 commit comments