Skip to content

Commit e778d27

Browse files
committed
added tree metadata
1 parent dd99ac5 commit e778d27

File tree

12 files changed

+31010
-4
lines changed

12 files changed

+31010
-4
lines changed

examples/domtree_example.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from langchain_community.document_loaders import AsyncHtmlLoader
2+
import time
3+
from scrapegraphai.asdt import DOMTree
4+
5+
def index_subtrees(subtrees):
    """Group subtrees by the structure and content hashes of their roots.

    Args:
        subtrees: Iterable of objects exposing ``root.structure_hash`` and
            ``root.content_hash``.

    Returns:
        A ``(structure_index, content_index)`` tuple. Each element is a
        ``defaultdict(list)`` mapping a hash value to the subtrees whose
        root carries that hash, in the order they were encountered.
    """
    from collections import defaultdict

    by_structure, by_content = defaultdict(list), defaultdict(list)

    for tree in subtrees:
        root = tree.root
        # Read both hashes before touching either index so a failure on
        # one hash never leaves the two indexes out of step.
        structure_key, content_key = root.structure_hash, root.content_hash
        by_structure[structure_key].append(tree)
        by_content[content_key].append(tree)

    return by_structure, by_content
18+
19+
def find_matching_subtrees(index):
    """Return every unordered pair of subtrees that share a hash key.

    Args:
        index: Mapping from a hash value to the list of subtrees filed
            under it (as produced by ``index_subtrees``).

    Returns:
        A list of ``(subtree_a, subtree_b)`` tuples. Within each group the
        pairs follow list order (first with second, first with third, ...);
        groups with fewer than two members contribute nothing.
    """
    from itertools import combinations

    matches = []
    for group in index.values():
        # combinations() yields nothing for groups of size 0 or 1, so no
        # explicit length check is needed.
        matches.extend(combinations(group, 2))
    return matches
28+
29+
def print_subtree_details(subtree):
    """Build a one-line summary of a subtree for side-by-side comparison.

    Despite the name, nothing is printed: the summary is returned.

    Args:
        subtree: Object whose ``traverse(callback)`` calls ``callback`` with
            each node; nodes expose ``value`` and a dict-like ``attributes``.

    Returns:
        The ``"<value>: <content>"`` entries of all visited nodes joined by
        ``" | "``; a node without a ``'content'`` attribute contributes an
        empty content part.
    """
    parts = []

    def describe(node):
        # Collect one "<value>: <content>" entry per visited node.
        label = node.value
        content = node.attributes.get('content', '')
        parts.append(f"{label}: {content}")

    subtree.traverse(describe)
    return " | ".join(parts)
34+
35+
def print_matches_side_by_side(matches):
    """Print each matched pair of subtrees, one summary line per subtree.

    Args:
        matches: Iterable of two-element pairs of subtrees (for example the
            list returned by ``find_matching_subtrees``).
    """
    # Horizontal rule, padded by blank lines, printed after every pair.
    divider = "\n" + "-"*100 + "\n"

    for first, second in matches:
        first_details = print_subtree_details(first)
        second_details = print_subtree_details(second)
        print("Match Pair:")
        print("Subtree 1:", first_details)
        print("Subtree 2:", second_details)
        print(divider)
44+
45+
# *********************************************************************************************************************
46+
# Usage example:
47+
# *********************************************************************************************************************
48+
49+
# Download the page; the HTML of the first loaded document is analysed below.
loader = AsyncHtmlLoader('https://www.wired.com/category/science/')
document = loader.load()
html_content = document[0].page_content

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
curr_time = time.perf_counter()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
# Stop the timer right after construction so the figure reported at the end
# covers only building the tree, not text collection or console output.
build_time = time.perf_counter() - curr_time

nodes, metadatas = dom_tree.collect_text_nodes()  # Collect text nodes for analysis
for node, metadata in zip(nodes, metadatas):
    print("Text:", node)
    print("Metadata:", metadata)

# graph = dom_tree.visualize(exclude_tags=['script', 'style', 'meta', 'link'])
# subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes

# Index subtrees by structure and content
# structure_index, content_index = index_subtrees(subtrees)

# # Find matches based on structure
# structure_matches = find_matching_subtrees(structure_index)
# print("Structure-based matches found:", len(structure_matches))

# # Print structure-based matches side by side
# print_matches_side_by_side(structure_matches)

# # Optionally, do the same for content-based matches if needed
# content_matches = find_matching_subtrees(content_index)
# print("Content-based matches found:", len(content_matches))
# print_matches_side_by_side(content_matches)

print(f"Time taken to build DOM tree: {build_time:.2f} seconds")
80+
81+
# Optionally, traverse each subtree
82+
# for subtree in subtrees:
83+
# print("Subtree rooted at:", subtree.root.value)
84+
# subtree.traverse(lambda node: print(node))
85+
# Traverse the DOMTree and print each node
86+
# dom_tree.traverse(lambda node: print(node))

examples/faiss_vector.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from langchain_community.document_loaders import TextLoader
2+
from langchain_community.vectorstores import FAISS
3+
from langchain_openai import OpenAIEmbeddings
4+
from langchain_text_splitters import CharacterTextSplitter
5+
from langchain_community.document_loaders import AsyncHtmlLoader
6+
import time
7+
from scrapegraphai.asdt import DOMTree
8+
from dotenv import load_dotenv
9+
import os
10+
11+
# Populate the environment (python-dotenv) before reading the API key.
load_dotenv()
openai_key = os.getenv("OPENAI_APIKEY")
embeddings = OpenAIEmbeddings(api_key=openai_key)

# Download the page; the HTML of the first loaded document is used below.
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

# perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is unaffected by system clock adjustments.
curr_time = time.perf_counter()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
text_nodes, metadata = dom_tree.collect_text_nodes()  # Collect text nodes for analysis

print(f"Time taken to collect text nodes: {time.perf_counter() - curr_time}")

# Embed every text node and store it, with its metadata, in a FAISS index.
db_texts = FAISS.from_texts(
    texts=text_nodes,
    embedding=embeddings,
    metadatas=metadata
)

# Query for similar text
# NOTE(review): `query` is defined but never used in the visible script —
# presumably a `db_texts.similarity_search(query)` call is still to be added.
query = "List me all the projects"
34+

html_structure

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
digraph {
2+
rankdir=LR
3+
"[document]_1826340115328" [label="[document]"]
4+
text_1826340115200 [label=text]
5+
"[document]_1826340115328" -> text_1826340115200
6+
body_1826340440768 [label=body]
7+
"[document]_1826340115328" -> body_1826340440768
8+
header_1826340440960 [label=header]
9+
body_1826340440768 -> header_1826340440960
10+
nav_1826340441152 [label=nav]
11+
header_1826340440960 -> nav_1826340441152
12+
div_1826340441344 [label=div]
13+
nav_1826340441152 -> div_1826340441344
14+
a_1826340441536 [label=a]
15+
div_1826340441344 -> a_1826340441536
16+
span_1826340441728 [label=span]
17+
a_1826340441536 -> span_1826340441728
18+
text_1826340441920 [label=text]
19+
span_1826340441728 -> text_1826340441920
20+
text_1826340442240 [label=text]
21+
a_1826340441536 -> text_1826340442240
22+
button_1826340442560 [label=button]
23+
div_1826340441344 -> button_1826340442560
24+
span_1826340442752 [label=span]
25+
button_1826340442560 -> span_1826340442752
26+
text_1826340442880 [label=text]
27+
span_1826340442752 -> text_1826340442880
28+
span_1826340443200 [label=span]
29+
button_1826340442560 -> span_1826340443200
30+
span_1826340443456 [label=span]
31+
button_1826340442560 -> span_1826340443456
32+
span_1826340443712 [label=span]
33+
button_1826340442560 -> span_1826340443712
34+
div_1826340444032 [label=div]
35+
div_1826340441344 -> div_1826340444032
36+
ul_1826340444224 [label=ul]
37+
div_1826340444032 -> ul_1826340444224
38+
li_1826340444416 [label=li]
39+
ul_1826340444224 -> li_1826340444416
40+
a_1826340444608 [label=a]
41+
li_1826340444416 -> a_1826340444608
42+
text_1826340444800 [label=text]
43+
a_1826340444608 -> text_1826340444800
44+
li_1826340445120 [label=li]
45+
li_1826340444416 -> li_1826340445120
46+
a_1826340445312 [label=a]
47+
li_1826340445120 -> a_1826340445312
48+
text_1826340445504 [label=text]
49+
a_1826340445312 -> text_1826340445504
50+
span_1826340445760 [label=span]
51+
a_1826340445312 -> span_1826340445760
52+
text_1826340445952 [label=text]
53+
span_1826340445760 -> text_1826340445952
54+
div_1826340446336 [label=div]
55+
li_1826340445120 -> div_1826340446336
56+
a_1826340446528 [label=a]
57+
div_1826340446336 -> a_1826340446528
58+
text_1826340446720 [label=text]
59+
a_1826340446528 -> text_1826340446720
60+
div_1826340447040 [label=div]
61+
div_1826340446336 -> div_1826340447040
62+
a_1826340447296 [label=a]
63+
div_1826340446336 -> a_1826340447296
64+
text_1826340447488 [label=text]
65+
a_1826340447296 -> text_1826340447488
66+
li_1826340447872 [label=li]
67+
li_1826340445120 -> li_1826340447872
68+
a_1826340448064 [label=a]
69+
li_1826340447872 -> a_1826340448064
70+
text_1826340448256 [label=text]
71+
a_1826340448064 -> text_1826340448256
72+
li_1826340448576 [label=li]
73+
li_1826340447872 -> li_1826340448576
74+
button_1826340448768 [label=button]
75+
li_1826340448576 -> button_1826340448768
76+
i_1826340448960 [label=i]
77+
button_1826340448768 -> i_1826340448960
78+
i_1826340449216 [label=i]
79+
button_1826340448768 -> i_1826340449216
80+
progress_1826340450048 [label=progress]
81+
header_1826340440960 -> progress_1826340450048
82+
div_1826340450240 [label=div]
83+
progress_1826340450048 -> div_1826340450240
84+
span_1826340450432 [label=span]
85+
div_1826340450240 -> span_1826340450432
86+
div_1826340450880 [label=div]
87+
body_1826340440768 -> div_1826340450880
88+
div_1826340451072 [label=div]
89+
div_1826340450880 -> div_1826340451072
90+
header_1826340451264 [label=header]
91+
div_1826340451072 -> header_1826340451264
92+
h1_1826340451456 [label=h1]
93+
header_1826340451264 -> h1_1826340451456
94+
text_1826340451648 [label=text]
95+
h1_1826340451456 -> text_1826340451648
96+
p_1826340451968 [label=p]
97+
header_1826340451264 -> p_1826340451968
98+
article_1826340452288 [label=article]
99+
div_1826340451072 -> article_1826340452288
100+
div_1826340452480 [label=div]
101+
article_1826340452288 -> div_1826340452480
102+
div_1826340452672 [label=div]
103+
div_1826340452480 -> div_1826340452672
104+
div_1826340452864 [label=div]
105+
div_1826340452672 -> div_1826340452864
106+
div_1826340453120 [label=div]
107+
div_1826340452672 -> div_1826340453120
108+
a_1826340453312 [label=a]
109+
div_1826340453120 -> a_1826340453312
110+
div_1826340453504 [label=div]
111+
a_1826340453312 -> div_1826340453504
112+
figure_1826340453696 [label=figure]
113+
div_1826340453504 -> figure_1826340453696
114+
picture_1826340453888 [label=picture]
115+
figure_1826340453696 -> picture_1826340453888
116+
source_1826340454080 [label=source]
117+
picture_1826340453888 -> source_1826340454080
118+
source_1826340454336 [label=source]
119+
picture_1826340453888 -> source_1826340454336
120+
source_1826340487424 [label=source]
121+
picture_1826340453888 -> source_1826340487424
122+
img_1826340487680 [label=img]
123+
picture_1826340453888 -> img_1826340487680
124+
div_1826340488064 [label=div]
125+
div_1826340453504 -> div_1826340488064
126+
h4_1826340488256 [label=h4]
127+
div_1826340488064 -> h4_1826340488256
128+
text_1826340488384 [label=text]
129+
h4_1826340488256 -> text_1826340488384
130+
p_1826340488704 [label=p]
131+
div_1826340488064 -> p_1826340488704
132+
text_1826340488832 [label=text]
133+
p_1826340488704 -> text_1826340488832
134+
div_1826340489088 [label=div]
135+
p_1826340488704 -> div_1826340489088
136+
div_1826340489664 [label=div]
137+
div_1826340452672 -> div_1826340489664
138+
div_1826340489920 [label=div]
139+
div_1826340452672 -> div_1826340489920
140+
a_1826340490112 [label=a]
141+
div_1826340489920 -> a_1826340490112
142+
div_1826340490304 [label=div]
143+
a_1826340490112 -> div_1826340490304
144+
figure_1826340490496 [label=figure]
145+
div_1826340490304 -> figure_1826340490496
146+
picture_1826340490688 [label=picture]
147+
figure_1826340490496 -> picture_1826340490688
148+
source_1826340490880 [label=source]
149+
picture_1826340490688 -> source_1826340490880
150+
source_1826340491136 [label=source]
151+
picture_1826340490688 -> source_1826340491136
152+
source_1826340491392 [label=source]
153+
picture_1826340490688 -> source_1826340491392
154+
img_1826340491648 [label=img]
155+
picture_1826340490688 -> img_1826340491648
156+
div_1826340492032 [label=div]
157+
div_1826340490304 -> div_1826340492032
158+
h4_1826340492224 [label=h4]
159+
div_1826340492032 -> h4_1826340492224
160+
text_1826340492352 [label=text]
161+
h4_1826340492224 -> text_1826340492352
162+
p_1826340492672 [label=p]
163+
div_1826340492032 -> p_1826340492672
164+
text_1826340492800 [label=text]
165+
p_1826340492672 -> text_1826340492800
166+
div_1826340493056 [label=div]
167+
p_1826340492672 -> div_1826340493056
168+
div_1826340493632 [label=div]
169+
div_1826340452672 -> div_1826340493632
170+
div_1826340493952 [label=div]
171+
div_1826340452672 -> div_1826340493952
172+
a_1826340494144 [label=a]
173+
div_1826340493952 -> a_1826340494144
174+
div_1826340494336 [label=div]
175+
a_1826340494144 -> div_1826340494336
176+
figure_1826340494528 [label=figure]
177+
div_1826340494336 -> figure_1826340494528
178+
picture_1826340494720 [label=picture]
179+
figure_1826340494528 -> picture_1826340494720
180+
source_1826340494912 [label=source]
181+
picture_1826340494720 -> source_1826340494912
182+
source_1826340495168 [label=source]
183+
picture_1826340494720 -> source_1826340495168
184+
source_1826340495424 [label=source]
185+
picture_1826340494720 -> source_1826340495424
186+
img_1826340495680 [label=img]
187+
picture_1826340494720 -> img_1826340495680
188+
div_1826340496064 [label=div]
189+
div_1826340494336 -> div_1826340496064
190+
h4_1826340496256 [label=h4]
191+
div_1826340496064 -> h4_1826340496256
192+
text_1826340496384 [label=text]
193+
h4_1826340496256 -> text_1826340496384
194+
p_1826340496704 [label=p]
195+
div_1826340496064 -> p_1826340496704
196+
text_1826340496832 [label=text]
197+
p_1826340496704 -> text_1826340496832
198+
div_1826340497088 [label=div]
199+
p_1826340496704 -> div_1826340497088
200+
div_1826340497664 [label=div]
201+
div_1826340452672 -> div_1826340497664
202+
div_1826340497920 [label=div]
203+
div_1826340452672 -> div_1826340497920
204+
a_1826340498112 [label=a]
205+
div_1826340497920 -> a_1826340498112
206+
div_1826340498304 [label=div]
207+
a_1826340498112 -> div_1826340498304
208+
figure_1826340498496 [label=figure]
209+
div_1826340498304 -> figure_1826340498496
210+
picture_1826340498688 [label=picture]
211+
figure_1826340498496 -> picture_1826340498688
212+
source_1826340498880 [label=source]
213+
picture_1826340498688 -> source_1826340498880
214+
source_1826340499136 [label=source]
215+
picture_1826340498688 -> source_1826340499136
216+
source_1826340499392 [label=source]
217+
picture_1826340498688 -> source_1826340499392
218+
img_1826340499648 [label=img]
219+
picture_1826340498688 -> img_1826340499648
220+
div_1826340500032 [label=div]
221+
div_1826340498304 -> div_1826340500032
222+
h4_1826340500224 [label=h4]
223+
div_1826340500032 -> h4_1826340500224
224+
text_1826340500352 [label=text]
225+
h4_1826340500224 -> text_1826340500352
226+
p_1826340500672 [label=p]
227+
div_1826340500032 -> p_1826340500672
228+
text_1826340500800 [label=text]
229+
p_1826340500672 -> text_1826340500800
230+
div_1826340501056 [label=div]
231+
p_1826340500672 -> div_1826340501056
232+
footer_1826340501952 [label=footer]
233+
body_1826340440768 -> footer_1826340501952
234+
div_1826340502144 [label=div]
235+
footer_1826340501952 -> div_1826340502144
236+
text_1826340502272 [label=text]
237+
div_1826340502144 -> text_1826340502272
238+
a_1826340502528 [label=a]
239+
div_1826340502144 -> a_1826340502528
240+
text_1826340502720 [label=text]
241+
a_1826340502528 -> text_1826340502720
242+
text_1826340503040 [label=text]
243+
div_1826340502144 -> text_1826340503040
244+
a_1826340503296 [label=a]
245+
div_1826340502144 -> a_1826340503296
246+
text_1826340503488 [label=text]
247+
a_1826340503296 -> text_1826340503488
248+
text_1826340536576 [label=text]
249+
div_1826340502144 -> text_1826340536576
250+
a_1826340536896 [label=a]
251+
div_1826340502144 -> a_1826340536896
252+
text_1826340537088 [label=text]
253+
a_1826340536896 -> text_1826340537088
254+
text_1826340537408 [label=text]
255+
div_1826340502144 -> text_1826340537408
256+
}

html_structure.png

483 KB
Loading

scrapegraphai/asdt/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""
2+
__init__.py file for asdt module.
3+
"""
4+
5+
from .dom_tree import DOMTree

0 commit comments

Comments
 (0)