Skip to content

Commit f83a3f3

Browse files
authored
Merge pull request #81 from karlicoss/fix-myactivity-parsing
parse_html.activity._parse_html_activity: fix regression in bs4 >= 4.13.0 and add a unit test for html activity parsing
2 parents 2803158 + 187c9bc commit f83a3f3

File tree

3 files changed

+111
-4
lines changed

3 files changed

+111
-4
lines changed

google_takeout_parser/parse_html/activity.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from pathlib import Path
66
from datetime import datetime
7-
from typing import Any, List, Iterator, Optional, Tuple, Union, Dict, Iterable
7+
from typing import List, Iterator, Optional, Tuple, Union, Dict, Iterable
88
from urllib.parse import urlparse, parse_qs
99

1010
import bs4
@@ -336,10 +336,12 @@ def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
336336
file_dt = datetime.fromtimestamp(p.stat().st_mtime)
337337
data = p.read_text()
338338

339-
def soup_filter(tag: str, data: Dict[str, Any]) -> bool:
340-
return tag == "div" and "outer-cell" in data.get("class", "")
339+
def contains_outer_cell(cls: Optional[str]) -> bool:
340+
return cls is not None and "outer-cell" in cls
341341

342-
soup = bs4.BeautifulSoup(data, "lxml", parse_only=bs4.SoupStrainer(soup_filter)) # type: ignore[arg-type] # this overload is missing from stubs
342+
strainer = bs4.SoupStrainer(name="div", attrs={"class": contains_outer_cell})
343+
344+
soup = bs4.BeautifulSoup(data, "lxml", parse_only=strainer)
343345

344346
outer_divs: Iterable[bs4.element.Tag] = soup.children # type: ignore[assignment] # mypy can't guess they will actually be tags..
345347
for outer_div in outer_divs:

tests/test_html.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from datetime import datetime, timezone
2+
3+
from google_takeout_parser.models import Activity, LocationInfo
4+
from google_takeout_parser.parse_html.activity import _parse_html_activity
5+
6+
from .common import this_dir
7+
8+
9+
def test_parse_html_activity() -> None:
10+
11+
activity_html = this_dir / "testdata/HtmlTakeout/My Activity/Chrome/MyActivity.html"
12+
13+
results = list(_parse_html_activity(activity_html))
14+
15+
assert results == [
16+
Activity(
17+
header="Search",
18+
title="Visited https://productforums.google.com/forum/",
19+
time=datetime(2018, 1, 31, 22, 54, 50, tzinfo=timezone.utc),
20+
description=None,
21+
titleUrl="https://productforums.google.com/forum/",
22+
subtitles=[],
23+
details=[],
24+
locationInfos=[],
25+
products=["Search"],
26+
),
27+
Activity(
28+
header="Search",
29+
title="Visited http://www.adobe.com/creativecloud.html",
30+
time=datetime(2017, 2, 8, 0, 32, 39, tzinfo=timezone.utc),
31+
description=None,
32+
titleUrl="https://www.google.com/url?q=http://www.adobe.com/creativecloud.html&usg=AFQjCNH6fum5tBw7J0dbmUYKGFPduC0vSg",
33+
subtitles=[],
34+
details=[],
35+
locationInfos=[],
36+
products=["Search"],
37+
),
38+
Activity(
39+
header="Search",
40+
title="Searched for adobe creative cloud",
41+
time=datetime(2017, 2, 8, 0, 32, 36, tzinfo=timezone.utc),
42+
description=None,
43+
titleUrl="https://www.google.com/search?q=adobe+creative+cloud",
44+
subtitles=[],
45+
details=[],
46+
locationInfos=[
47+
LocationInfo(
48+
name="From your home: https://google.com/maps?q=25.800819,",
49+
url=None,
50+
source="80.186310",
51+
sourceUrl="https://google.com/maps?q=25.800819,-80.186310",
52+
),
53+
LocationInfo(name=None, url=None, source="", sourceUrl=None),
54+
],
55+
products=["Search"],
56+
),
57+
]
tests/testdata/HtmlTakeout/My Activity/Chrome/MyActivity.html

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<html><head><title>My Activity History</title><style type="text/css">
2+
body {
3+
padding: 5px;
4+
background: #EEEEEE;
5+
}
6+
7+
.mdl-cell {
8+
background-color: #FFFFFF;
9+
}
10+
11+
.content-cell.mdl-cell {
12+
color: rgba(0, 0, 0, 0.54);
13+
}
14+
15+
.header-cell.mdl-cell {
16+
border-bottom-style: solid;
17+
border-bottom-width: 1px;
18+
border-bottom-color: rgba(0, 0, 0, 0.1);
19+
}
20+
21+
.image-preview {
22+
width:72px;
23+
height:72px;
24+
}
25+
26+
</style></head><body><div class="mdl-grid">
27+
28+
29+
30+
31+
32+
<div class="outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp"><div class="mdl-grid"><div class="header-cell mdl-cell mdl-cell--12-col"><p class="mdl-typography--title">Search<br></p></div>
33+
<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1">Visited&nbsp;<a href="https://productforums.google.com/forum/">https://productforums.google.com/forum/</a><br>Jan 31, 2018, 10:54:50 PM</div>
34+
<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1 mdl-typography--text-right"></div>
35+
<div class="content-cell mdl-cell mdl-cell--12-col mdl-typography--caption"><b>Products:</b><br>&emsp;Search<br></div></div></div></div>
36+
<div class="outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp"><div class="mdl-grid">
37+
<div class="header-cell mdl-cell mdl-cell--12-col"><p class="mdl-typography--title">Search<br></p></div>
38+
<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1">Visited&nbsp;<a href="https://www.google.com/url?q=http://www.adobe.com/creativecloud.html&amp;usg=AFQjCNH6fum5tBw7J0dbmUYKGFPduC0vSg">http://www.adobe.com/creativecloud.html</a><br>Feb 8, 2017, 12:32:39 AM</div>
39+
<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1 mdl-typography--text-right"></div>
40+
<div class="content-cell mdl-cell mdl-cell--12-col mdl-typography--caption"><b>Products:</b><br>&emsp;Search<br></div></div></div>
41+
<div class="outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp"><div class="mdl-grid">
42+
<div class="header-cell mdl-cell mdl-cell--12-col"><p class="mdl-typography--title">Search<br></p></div>
43+
<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1">Searched for&nbsp;<a href="https://www.google.com/search?q=adobe+creative+cloud">adobe creative cloud</a><br>Feb 8, 2017, 12:32:36 AM</div>
44+
<div class="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1 mdl-typography--text-right"></div>
45+
<div class="content-cell mdl-cell mdl-cell--12-col mdl-typography--caption"><b>Products:</b><br>&emsp;Search<br><b>Locations:</b><br>&emsp;From your home: <a href="https://google.com/maps?q=25.800819,-80.186310">https://google.com/maps?q=25.800819,-80.186310</a>
46+
<br>
47+
</div></div></div></div></body>
48+
</html>

0 commit comments

Comments
 (0)