Skip to content

Commit 88a0444

Browse files
committed
chore: add support for extracting inline LLM instructions into an already parsed object
1 parent 823b1d4 commit 88a0444

File tree

3 files changed

+44
-8
lines changed

3 files changed

+44
-8
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p
77

88
## [Unreleased]
99

10+
## [v3.3.0] - 2025-09-20
11+
12+
### Added
13+
- Optional parameter `$parsed` for `extractFromFile` and `extractFromHtml` to return each extracted block as an already parsed llms.txt object.
14+
1015
## [v3.2.0] - 2025-08-28
1116

1217
### Added
@@ -118,7 +123,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p
118123

119124
- Initial release.
120125

121-
[Unreleased]: https://github.com/raphaelstolt/llms-txt-php/compare/v3.2.0...HEAD
126+
[Unreleased]: https://github.com/raphaelstolt/llms-txt-php/compare/v3.3.0...HEAD
127+
[v3.3.0]: https://github.com/raphaelstolt/llms-txt-php/compare/v3.2.0...v3.3.0
122128
[v3.2.0]: https://github.com/raphaelstolt/llms-txt-php/compare/v3.1.0...v3.2.0
123129
[v3.1.0]: https://github.com/raphaelstolt/llms-txt-php/compare/v3.0.0...v3.1.0
124130
[v3.0.0]: https://github.com/raphaelstolt/llms-txt-php/compare/v2.1.1...v3.0.0

src/Extractor.php

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
use DOMDocument;
88
use DOMXPath;
9+
use Exception;
910
use RuntimeException;
1011

1112
final class Extractor
@@ -14,11 +15,11 @@ final class Extractor
1415
* Extracts the contents of all <script type="text/llms.txt"> blocks from a given HTML file.
1516
*
1617
* @param string $filePath Path to the HTML file.
17-
* @throws RuntimeException If a file isn't found or unreadable.
18+
* @param bool $parsed Whether the llms.txt should be parsed or not.
19+
* @throws RuntimeException|Exception If the file isn't found or is unreadable, or if the llms.txt parsing fails.
1820
* @return string[] Extracted contents.
19-
*
2021
*/
21-
public function extractFromFile(string $filePath): array
22+
public function extractFromFile(string $filePath, bool $parsed = false): array
2223
{
2324
if (!\is_readable($filePath)) {
2425
throw new RuntimeException("HTML file {$filePath} not found or not readable");
@@ -30,16 +31,18 @@ public function extractFromFile(string $filePath): array
3031
throw new RuntimeException("Failed to read HTML file {$filePath}");
3132
}
3233

33-
return $this->extractFromHtml($html);
34+
return $this->extractFromHtml($html, $parsed);
3435
}
3536

3637
/**
3738
* Extracts the contents of all <script type="text/llms.txt"> blocks from raw HTML.
3839
*
3940
* @param string $html Raw HTML string.
40-
* @return string[] Extracted contents.
41+
* @param bool $parsed Whether the llms.txt should be parsed or not.
42+
* @throws Exception If the llms.txt parsing fails.
43+
* @return array Extracted contents.
4144
*/
42-
public function extractFromHtml(string $html): array
45+
public function extractFromHtml(string $html, bool $parsed = false): array
4346
{
4447
$dom = new DOMDocument();
4548
// Suppress warnings from malformed HTML
@@ -50,7 +53,11 @@ public function extractFromHtml(string $html): array
5053

5154
$contents = [];
5255
foreach ($nodes as $node) {
53-
$contents[] = \trim($node->textContent);
56+
$content = \trim($node->textContent);
57+
if ($parsed) {
58+
$content = (new LlmsTxt)->parse($node->textContent);
59+
}
60+
$contents[] = $content;
5461
}
5562

5663
return $contents;

tests/ExtractorTest.php

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use PHPUnit\Framework\Attributes\Test;
88
use RuntimeException;
99
use Stolt\LlmsTxt\Extractor;
10+
use Stolt\LlmsTxt\LlmsTxt;
1011

1112
final class ExtractorTest extends TestCase
1213
{
@@ -64,6 +65,28 @@ public function extractsMultipleBlocksFromHtml(): void
6465
$this->assertSame(['first-block', 'second-block'], $this->extractor->extractFromHtml($html));
6566
}
6667

68+
#[Test]
69+
public function parsesLlmsTxtBlocksFromHtml(): void
70+
{
71+
$html = <<<HTML
72+
<html>
73+
<body>
74+
<script type="text/llms.txt"># Title one</script>
75+
Some other content.
76+
<p>And some more content.</p>
77+
<br />
78+
<script type="text/llms.txt"># Title two</script>
79+
</body>
80+
</html>
81+
HTML;
82+
83+
$this->assertInstanceof(LlmsTxt::class, $this->extractor->extractFromHtml($html, true)[0]);
84+
85+
$secondLlmsTxtBlock = $this->extractor->extractFromHtml($html, true)[1];
86+
87+
$this->assertEquals('Title two', $secondLlmsTxtBlock->getTitle());
88+
}
89+
6790
#[Test]
6891
public function returnsAnEmptyArrayWhenNoBlocksAreFound(): void
6992
{

0 commit comments

Comments
 (0)