1
+ import re
2
+
1
3
from bs4 import BeautifulSoup
2
- from selfie_lib import Camera , Snapshot , StringSelfie , expect_selfie
4
+ from markdownify import markdownify as md
5
+ from selfie_lib import Camera , CompoundLens , Snapshot , StringSelfie , expect_selfie
3
6
from werkzeug .test import TestResponse
4
7
5
8
REDIRECTS = {
@@ -21,20 +24,42 @@ def _web_camera(response: TestResponse) -> Snapshot:
21
24
return Snapshot .of (response .data .decode ()).plus_facet ("status" , response .status )
22
25
23
26
24
- def _pretty_print_html (html : str ) -> str :
25
- return BeautifulSoup (html , "html.parser" ).prettify ()
27
+ def _pretty_print_html (html : str ):
28
+ return BeautifulSoup (html , "html.parser" ).prettify () if "<html" in html else None
26
29
27
30
28
- def _pretty_print_lens (snapshot : Snapshot ) -> Snapshot :
29
- if "<html" in snapshot .subject .value_string ():
30
- return snapshot .plus_or_replace (
31
- "" , _pretty_print_html (snapshot .subject .value_string ())
32
- )
31
+ def _html_to_md (html : str ):
32
+ if "<html" not in html :
33
+ return None
33
34
else :
34
- return snapshot
35
+ # Remove <br> tags
36
+ clean_html = re .sub (r"<br.*?>" , "" , html )
37
+
38
+ # Convert HTML to Markdown
39
+ md_text = md (clean_html )
40
+
41
+ # Remove specific patterns from lines
42
+ md_text = re .sub (r"(?m)^====+" , "" , md_text )
43
+ md_text = re .sub (r"(?m)^---+" , "" , md_text )
44
+ md_text = re .sub (r"(?m)^\*\*\*[^\* ]+" , "" , md_text )
45
+
46
+ # Replace multiple newlines with double newlines
47
+ md_text = re .sub (r"\n\n+" , "\n \n " , md_text )
48
+
49
+ # Trim each line
50
+ trim_lines = "\n " .join (line .strip () for line in md_text .split ("\n " ))
51
+
52
+ return trim_lines .strip ()
53
+
35
54
55
# Lens applied to every web snapshot, built from three chained steps:
#   1. pretty-print the "" facet when it holds an HTML page
#      (NOTE(review): "" appears to address the snapshot's main subject --
#      confirm against selfie_lib);
#   2. rewrite "http://localhost:<port>/" URLs to the fixed demo host,
#      presumably so snapshots do not depend on the test server's port;
#   3. derive an "md" facet from the "" facet via _html_to_md.
HTML_LENS = (
    CompoundLens()
    .mutate_facet("", _pretty_print_html)
    .replace_all_regex(r"http://localhost:\d+/", "https://demo.selfie.dev/")
    .set_facet_from("md", "", _html_to_md)
)

# Camera that snapshots a werkzeug TestResponse and runs it through HTML_LENS.
WEB_CAMERA = Camera.of(_web_camera).with_lens(HTML_LENS)
38
63
39
64
40
65
def web_selfie (response : TestResponse ) -> StringSelfie :
0 commit comments