Skip to content

Commit f50d7f2

Browse files
authored
Optimize FlatText.get/3 memory usage and speed (#664)
* Optimize FlatText.get/3 memory usage and speed

  Replaced Enum.reduce/3 with an optimal tail-recursive approach in FlatText. By traversing lists manually, we avoid the overhead of the Enumerable protocol and the allocation of anonymous closures per reduction iteration.

  Benchmark results when running Floki.FlatText.get on a large document:

  ```text
  Name                     ips        average  deviation         median         99th %
  Tail recursion        3.21 M      311.16 ns ±23057.88%          50 ns         160 ns
  Enum.reduce           2.76 M      361.71 ns ±20499.20%          50 ns         180 ns

  Comparison:
  Tail recursion        3.21 M
  Enum.reduce           2.76 M - 1.16x slower +50.55 ns

  Memory usage statistics:

  Name             Memory usage
  Tail recursion          136 B
  Enum.reduce             208 B - 1.53x memory usage +72 B
  ```

* Add tests for FlatText edge cases

* Run formatter
1 parent d6f8407 commit f50d7f2

File tree

2 files changed

+48
-6
lines changed

2 files changed

+48
-6
lines changed

lib/floki/flat_text.ex

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@ defmodule Floki.FlatText do
1717

1818
def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do
1919
html_nodes
20-
|> Enum.reduce([], fn html_node, acc ->
21-
text_from_node(html_node, acc, 0, sep, include_inputs?)
22-
end)
20+
|> text_from_nodes([], 0, sep, include_inputs?)
2321
|> IO.iodata_to_binary()
2422
end
2523

@@ -29,6 +27,13 @@ defmodule Floki.FlatText do
2927
|> IO.iodata_to_binary()
3028
end
3129

30+
defp text_from_nodes([], acc, _, _, _), do: acc
31+
32+
defp text_from_nodes([node | rest], acc, depth, sep, include_inputs?) do
33+
acc = text_from_node(node, acc, depth, sep, include_inputs?)
34+
text_from_nodes(rest, acc, depth, sep, include_inputs?)
35+
end
36+
3237
defp text_from_node({"input", attrs, []}, acc, _, _, true) do
3338
[acc, Floki.TextExtractor.extract_input_value(attrs)]
3439
end
@@ -39,9 +44,7 @@ defmodule Floki.FlatText do
3944

4045
defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?)
4146
when depth < 1 do
42-
Enum.reduce(html_nodes, acc, fn html_node, acc ->
43-
text_from_node(html_node, acc, depth + 1, sep, include_inputs?)
44-
end)
47+
text_from_nodes(html_nodes, acc, depth + 1, sep, include_inputs?)
4548
end
4649

4750
defp text_from_node(text, [], _, _sep, _) when is_binary(text), do: text

test/floki/flat_text_test.exs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,43 @@ defmodule Floki.FlatTextTest do
8484

8585
assert Floki.FlatText.get(nodes, " ") == expected_text
8686
end
87+
88+
# --- Edge Cases ---
89+
90+
test "does not extract text from deeply nested nodes (depth >= 1 for text nodes)" do
91+
node = {"div", [], [{"span", [], [{"p", [], ["Should not be here"]}]}]}
92+
assert Floki.FlatText.get(node) == ""
93+
end
94+
95+
test "does not extract inputs from deeply nested nodes (depth >= 1 for inputs)" do
96+
node = {"div", [], [{"span", [], [{"input", [{"value", "hidden"}], []}]}]}
97+
assert Floki.FlatText.get(node, " ", true) == ""
98+
end
99+
100+
test "handles multiple consecutive text nodes at root level" do
101+
nodes = ["one", "two", "three"]
102+
assert Floki.FlatText.get(nodes) == "onetwothree"
103+
assert Floki.FlatText.get(nodes, "-") == "one-two-three"
104+
end
105+
106+
test "handles multiple consecutive text nodes inside a node" do
107+
node = {"div", [], ["one", "two", "three"]}
108+
assert Floki.FlatText.get(node) == "onetwothree"
109+
assert Floki.FlatText.get(node, "-") == "one-two-three"
110+
end
111+
112+
test "ignores comments and doctypes" do
113+
nodes = [
114+
{:comment, "this is a comment"},
115+
"text",
116+
{:doctype, "html", "", ""}
117+
]
118+
119+
assert Floki.FlatText.get(nodes) == "text"
120+
end
121+
122+
test "empty nodes return blank string" do
123+
assert Floki.FlatText.get([]) == ""
124+
assert Floki.FlatText.get({"div", [], []}) == ""
125+
end
87126
end

0 commit comments

Comments
 (0)