Skip to content

Commit d4abcab

Browse files
Merge pull request #361 from EvotecIT/ParsingImprovements
2 parents 1383e6b + ac5b33f commit d4abcab

15 files changed

+991
-334
lines changed

PSParseHTML.psd1

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
CmdletsToExport = @('Close-HtmlBrowserSession', 'Compare-HTML', 'ConvertFrom-HTML', 'ConvertFrom-HtmlAttributes', 'ConvertFrom-HTMLCookie', 'ConvertFrom-HtmlForm', 'ConvertFrom-HtmlList', 'ConvertFrom-HtmlMeta', 'ConvertFrom-HtmlMicrodata', 'ConvertFrom-HtmlOpenGraph', 'ConvertFrom-HtmlTable', 'Convert-HTMLToText', 'Export-BrowserState', 'Export-HtmlBrowserSession', 'Export-HTMLOutline', 'Format-CSS', 'Format-HTML', 'Format-JavaScript', 'Get-HtmlBrowserConsoleLog', 'Get-HtmlBrowserContent', 'Get-HtmlBrowserCookie', 'Get-HtmlBrowserFormField', 'Get-HtmlBrowserInteractable', 'Get-HtmlBrowserLoginForm', 'Get-HtmlBrowserNetworkLog', 'Get-HTMLResource', 'Import-BrowserState', 'Import-HtmlBrowserSession', 'Invoke-HtmlBrowserClick', 'Invoke-HtmlBrowserDomScript', 'Invoke-HtmlBrowserLogin', 'Invoke-HtmlBrowserNavigation', 'Invoke-HtmlBrowserScript', 'Invoke-HTMLRendering', 'Measure-HtmlBrowserPerformance', 'Measure-HtmlDocumentStructure', 'New-HtmlBrowserCookie', 'Optimize-CSS', 'Optimize-Email', 'Optimize-HTML', 'Optimize-JavaScript', 'Register-HTMLRoute', 'Save-HtmlBrowserAttachment', 'Save-HtmlBrowserHar', 'Save-HtmlBrowserPdf', 'Save-HtmlBrowserScreenshot', 'Set-HtmlBrowserChecked', 'Set-HtmlBrowserClientOption', 'Set-HtmlBrowserCookie', 'Set-HtmlBrowserInput', 'Set-HtmlBrowserSelectOption', 'Show-HtmlBrowserHar', 'Start-HtmlBrowserTracing', 'Start-HtmlBrowserVideoCapture', 'Stop-HtmlBrowserTracing', 'Stop-HtmlBrowserVideoCapture', 'Submit-HtmlBrowserForm', 'Test-HtmlMicrodata', 'Unregister-HTMLRoute', 'Test-HtmlBrowser', 'Clear-HtmlBrowserCache')
55
CompanyName = 'Evotec'
66
CompatiblePSEditions = @('Desktop', 'Core')
7-
Copyright = '(c) 2011 - 2025 Przemyslaw Klys @ Evotec. All rights reserved.'
7+
Copyright = '(c) 2011 - 2026 Przemyslaw Klys @ Evotec. All rights reserved.'
88
Description = 'Module that allows to manipulate, parse, format and optimize HTML, JavaScript and CSS'
99
DotNetFrameworkVersion = '4.7.2'
1010
FunctionsToExport = @()

README.MD

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,7 @@ Console.WriteLine($"\nSlowest page: {slowest.Url} ({slowest.PageLoadTime.TotalSe
10701070

10711071
### Playwright Auto-Setup
10721072

1073-
Playwright browsers are automatically downloaded on first use. No manual setup required! The download happens once per system and is shared across all applications.
1073+
Playwright browsers are automatically downloaded on first use. No manual setup required. The download is cached per-user (default locations below).
10741074

10751075
#### How Auto-Download Works
10761076
When you first use browser testing, Playwright automatically downloads required components:
@@ -1093,6 +1093,13 @@ When you first use browser testing, Playwright automatically downloads required
10931093
- Subsequent runs use cached components - no re-download needed
10941094
- You can manually ensure browsers are installed using `HtmlBrowser.EnsureInstalledAsync()`
10951095

1096+
#### Linux: Avoiding sudo prompts
1097+
On Linux, Playwright can also install OS-level dependencies when invoked with `--with-deps` (this typically requires root/sudo).
1098+
1099+
By default, HtmlTinkerX only uses `--with-deps` when running as root to avoid unexpected sudo prompts during normal test execution. You can override this behavior by setting:
1100+
- `HTMLTINKERX_PLAYWRIGHT_WITH_DEPS=1` to force `--with-deps`
1101+
- `HTMLTINKERX_PLAYWRIGHT_WITH_DEPS=0` to never use `--with-deps`
1102+
10961103
#### Cleaning Playwright Cache
10971104
```powershell
10981105
# View cache size and clean if needed

Sources/HtmlTinkerX.Tests/HtmlBrowserInstallerTests.cs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ namespace HtmlTinkerX.Tests;
1616
public class HtmlBrowserInstallerTests
1717
{
1818
[Fact]
19-
public async Task EnsureInstalledAsync_InstallsDepsOnLinux()
19+
public async Task EnsureInstalledAsync_InstallsDepsOnLinux_WhenEnabled()
2020
{
2121
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
2222
{
@@ -27,6 +27,7 @@ public async Task EnsureInstalledAsync_InstallsDepsOnLinux()
2727
string tempDriver = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
2828
Environment.SetEnvironmentVariable("PLAYWRIGHT_BROWSERS_PATH", tempBrowsers);
2929
Environment.SetEnvironmentVariable("PLAYWRIGHT_DRIVER_SEARCH_PATH", tempDriver);
30+
Environment.SetEnvironmentVariable("HTMLTINKERX_PLAYWRIGHT_WITH_DEPS", "1");
3031

3132
var originalInstaller = HtmlBrowser.PlaywrightInstaller;
3233

@@ -54,6 +55,53 @@ public async Task EnsureInstalledAsync_InstallsDepsOnLinux()
5455
HtmlBrowser.PlaywrightInstaller = originalInstaller;
5556
Environment.SetEnvironmentVariable("PLAYWRIGHT_BROWSERS_PATH", null);
5657
Environment.SetEnvironmentVariable("PLAYWRIGHT_DRIVER_SEARCH_PATH", null);
58+
Environment.SetEnvironmentVariable("HTMLTINKERX_PLAYWRIGHT_WITH_DEPS", null);
59+
if (Directory.Exists(tempBrowsers)) Directory.Delete(tempBrowsers, true);
60+
if (Directory.Exists(tempDriver)) Directory.Delete(tempDriver, true);
61+
}
62+
}
63+
64+
[Fact]
65+
public async Task EnsureInstalledAsync_DoesNotInstallDepsOnLinux_WhenDisabled()
66+
{
67+
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
68+
{
69+
return;
70+
}
71+
72+
string tempBrowsers = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
73+
string tempDriver = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
74+
Environment.SetEnvironmentVariable("PLAYWRIGHT_BROWSERS_PATH", tempBrowsers);
75+
Environment.SetEnvironmentVariable("PLAYWRIGHT_DRIVER_SEARCH_PATH", tempDriver);
76+
Environment.SetEnvironmentVariable("HTMLTINKERX_PLAYWRIGHT_WITH_DEPS", "0");
77+
78+
var originalInstaller = HtmlBrowser.PlaywrightInstaller;
79+
80+
try
81+
{
82+
// prepare fake driver so IsDriverPresent returns true
83+
string baseDir = Path.Combine(tempDriver, ".playwright");
84+
string platformId = PlatformExtensions.GetCurrentPlatform().ToPlatformId();
85+
string nodeDir = Path.Combine(baseDir, "node", platformId);
86+
Directory.CreateDirectory(nodeDir);
87+
Directory.CreateDirectory(Path.Combine(baseDir, "package"));
88+
File.WriteAllText(Path.Combine(nodeDir, RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "node.exe" : "node"), "");
89+
File.WriteAllText(Path.Combine(baseDir, ".version"), typeof(Microsoft.Playwright.Playwright).Assembly.GetName().Version?.ToString(3) ?? "1.52.0");
90+
91+
string[]? captured = null;
92+
HtmlBrowser.PlaywrightInstaller = args => captured = args;
93+
94+
await HtmlBrowser.EnsureInstalledAsync(HtmlBrowserEngine.Chromium);
95+
96+
Assert.NotNull(captured);
97+
Assert.DoesNotContain("--with-deps", captured);
98+
}
99+
finally
100+
{
101+
HtmlBrowser.PlaywrightInstaller = originalInstaller;
102+
Environment.SetEnvironmentVariable("PLAYWRIGHT_BROWSERS_PATH", null);
103+
Environment.SetEnvironmentVariable("PLAYWRIGHT_DRIVER_SEARCH_PATH", null);
104+
Environment.SetEnvironmentVariable("HTMLTINKERX_PLAYWRIGHT_WITH_DEPS", null);
57105
if (Directory.Exists(tempBrowsers)) Directory.Delete(tempBrowsers, true);
58106
if (Directory.Exists(tempDriver)) Directory.Delete(tempDriver, true);
59107
}
@@ -272,4 +320,3 @@ protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage reques
272320
}
273321
}
274322
}
275-

Sources/HtmlTinkerX.Tests/HtmlHttpClientFactoryTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,12 @@ public async Task Create_WithCookieContainer_StoresCookies() {
132132
using HttpClient client = HtmlHttpClientFactory.Create(out CookieContainer container);
133133
HttpResponseMessage response = await client.GetAsync(url);
134134
response.EnsureSuccessStatusCode();
135-
Cookie cookie = container.GetCookies(new System.Uri(url))["session"];
135+
Cookie? cookie = container.GetCookies(new System.Uri(url))["session"];
136136
Assert.NotNull(cookie);
137137
Assert.Equal("abc", cookie.Value);
138138
} finally {
139139
server.Stop();
140140
server.Close();
141141
}
142142
}
143-
}
143+
}

Sources/HtmlTinkerX.Tests/HtmlOpenGraphTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@ public void ParseOpenGraph_ReturnsExpected() {
2121

2222
[Fact]
2323
public void ParseOpenGraph_NullHtml_Throws() {
24-
Assert.Throws<ArgumentNullException>(() => HtmlParser.ParseOpenGraph(null));
24+
Assert.Throws<ArgumentNullException>(() => HtmlParser.ParseOpenGraph(null!));
2525
}
26-
}
26+
}

Sources/HtmlTinkerX.Tests/HtmlParserFormTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,6 @@ public void ParseFormsWithAngleSharp_ReturnsForms() {
3333
/// Throws when html content is null.
3434
/// </summary>
3535
public void ParseFormsWithAngleSharp_NullHtml_Throws() {
36-
Assert.Throws<ArgumentNullException>(() => HtmlParser.ParseFormsWithAngleSharp(null));
36+
Assert.Throws<ArgumentNullException>(() => HtmlParser.ParseFormsWithAngleSharp(null!));
3737
}
38-
}
38+
}

Sources/HtmlTinkerX.Tests/HtmlParserFromTableHelpersTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ public void ParseTableRows_ReturnsRows() {
2323
var doc = HtmlParser.ParseWithAngleSharp(html);
2424
var table = doc.QuerySelector("table")!;
2525
var (meta, rows, start) = HtmlParserFromTable.ReadTableMetadata(table, 0, null, false, false);
26-
var data = HtmlParserFromTable.ParseTableRows(rows, start, meta.Headers, null, false, null);
26+
var data = HtmlParserFromTable.ParseTableRows(rows, start, meta.Headers, null, false, null, HtmlCellTextFormat.Compact);
2727
Assert.Single(data);
2828
var row = data[0];
2929
Assert.Equal("1", row["A"]);

Sources/HtmlTinkerX.Tests/HtmlParserListTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,6 @@ public void ParseListsWithHtmlAgilityPackDetailed_ReturnsMetadata() {
6464

6565
[Fact]
6666
public void ParseListsWithAngleSharpDetailed_NullHtml_Throws() {
67-
Assert.Throws<ArgumentNullException>(() => HtmlParser.ParseListsWithAngleSharpDetailed(null, " "));
67+
Assert.Throws<ArgumentNullException>(() => HtmlParser.ParseListsWithAngleSharpDetailed(null!, " "));
6868
}
69-
}
69+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
namespace HtmlTinkerX;
2+
3+
/// <summary>
4+
/// Controls how text is extracted from table cells.
5+
/// </summary>
6+
public enum HtmlCellTextFormat {
7+
/// <summary>
8+
/// Single-line, collapses whitespace (legacy/default).
9+
/// </summary>
10+
Compact = 0,
11+
/// <summary>
12+
/// Preserves block boundaries and list items using newlines and bullets.
13+
/// </summary>
14+
Lines = 1,
15+
/// <summary>
16+
/// Similar to Lines but uses Markdown-friendly bullets.
17+
/// </summary>
18+
Markdown = 2
19+
}

Sources/HtmlTinkerX/HtmlParser.cs

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -93,18 +93,20 @@ public static async Task<HtmlDocument> ParseUrlWithHtmlAgilityPackAsync(string u
9393
/// <param name="allProperties">Whether to pad rows with missing cells.</param>
9494
/// <param name="skipFooter">Whether to skip HTML table footer elements.</param>
9595
/// <param name="cleanHeaders">Whether to automatically clean special characters from header names.</param>
96-
/// <param name="emptyValuePlaceholder">Value to use for empty cells.</param>
97-
/// <returns>List of table parse results with metadata.</returns>
98-
public static List<HtmlTableResult> ParseTablesWithAngleSharpDetailed(
99-
string html,
100-
IDictionary<string, string>? replaceContent = null,
101-
IDictionary<string, string>? replaceHeaders = null,
102-
bool allProperties = false,
103-
bool skipFooter = false,
104-
bool cleanHeaders = false,
105-
string? emptyValuePlaceholder = null) {
106-
return HtmlParserFromTable.ParseTablesWithAngleSharpDetailed(html, replaceContent, replaceHeaders, allProperties, skipFooter, cleanHeaders, emptyValuePlaceholder);
107-
}
96+
/// <param name="emptyValuePlaceholder">Value to use for empty cells.</param>
97+
/// <param name="cellTextFormat">Controls how cell text is flattened (compact, lines, markdown).</param>
98+
/// <returns>List of table parse results with metadata.</returns>
99+
public static List<HtmlTableResult> ParseTablesWithAngleSharpDetailed(
100+
string html,
101+
IDictionary<string, string>? replaceContent = null,
102+
IDictionary<string, string>? replaceHeaders = null,
103+
bool allProperties = false,
104+
bool skipFooter = false,
105+
bool cleanHeaders = false,
106+
string? emptyValuePlaceholder = null,
107+
HtmlCellTextFormat cellTextFormat = HtmlCellTextFormat.Compact) {
108+
return HtmlParserFromTable.ParseTablesWithAngleSharpDetailed(html, replaceContent, replaceHeaders, allProperties, skipFooter, cleanHeaders, emptyValuePlaceholder, cellTextFormat);
109+
}
108110

109111
/// <summary>
110112
/// Extracts table data from HTML markup using AngleSharp.
@@ -152,19 +154,21 @@ public static List<HtmlTableResult> ParseTablesWithAngleSharpDetailed(
152154
/// <param name="allProperties">Whether to pad rows with missing cells.</param>
153155
/// <param name="skipFooter">Whether to skip HTML table footer elements.</param>
154156
/// <param name="cleanHeaders">Whether to automatically clean special characters from header names.</param>
155-
/// <param name="emptyValuePlaceholder">Value to use for empty cells.</param>
157+
/// <param name="emptyValuePlaceholder">Value to use for empty cells.</param>
158+
/// <param name="cellTextFormat">Controls how cell text is flattened (compact, lines, markdown).</param>
156159
/// <returns>List of table parse results with metadata.</returns>
157-
public static List<HtmlTableResult> ParseTablesWithHtmlAgilityPackDetailed(
158-
string html,
159-
bool reverseTable = false,
160-
IDictionary<string, string>? replaceContent = null,
161-
IDictionary<string, string>? replaceHeaders = null,
162-
bool allProperties = false,
163-
bool skipFooter = false,
164-
bool cleanHeaders = false,
165-
string? emptyValuePlaceholder = null) {
166-
return HtmlParserFromTable.ParseTablesWithHtmlAgilityPackDetailed(html, reverseTable, replaceContent, replaceHeaders, allProperties, skipFooter, cleanHeaders, emptyValuePlaceholder);
167-
}
160+
public static List<HtmlTableResult> ParseTablesWithHtmlAgilityPackDetailed(
161+
string html,
162+
bool reverseTable = false,
163+
IDictionary<string, string>? replaceContent = null,
164+
IDictionary<string, string>? replaceHeaders = null,
165+
bool allProperties = false,
166+
bool skipFooter = false,
167+
bool cleanHeaders = false,
168+
string? emptyValuePlaceholder = null,
169+
HtmlCellTextFormat cellTextFormat = HtmlCellTextFormat.Compact) {
170+
return HtmlParserFromTable.ParseTablesWithHtmlAgilityPackDetailed(html, reverseTable, replaceContent, replaceHeaders, allProperties, skipFooter, cleanHeaders, emptyValuePlaceholder, cellTextFormat);
171+
}
168172

169173
/// <summary>
170174
/// Extracts table data from HTML markup using HtmlAgilityPack.
@@ -465,4 +469,4 @@ public static void EnsureUniqueNames(IList<string> names) {
465469
}
466470
}
467471
}
468-
}
472+
}

0 commit comments

Comments
 (0)