Skip to content

Commit b2935eb

Browse files
committed
Observe some length limits for notion, fix #5
1 parent 79f1fb9 commit b2935eb

File tree

9 files changed

+140
-103
lines changed

9 files changed

+140
-103
lines changed

html2notion/translate/html2json_base.py

Lines changed: 70 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,14 @@ class Block(Enum):
1919
TO_DO = "to_do"
2020
EQUATION = "equation"
2121

22+
2223
class Html2JsonBase:
24+
# https://developers.notion.com/reference/request-limits
25+
URL_MAX_LENGTH = 2000
26+
TEXT_MAX_LENGTH = 2000
27+
EXPRESSION_MAX_LENGTH = 1000
28+
RICHTEXT_ARRAY_LENGTH = 100
29+
2330
_registry = {}
2431
_text_annotations = {
2532
"bold": bool,
@@ -148,40 +155,45 @@ def generate_inline_obj(self, tag: PageElement):
148155
res_obj = []
149156
text_with_parents = Html2JsonBase.extract_text_and_parents(tag)
150157
for (text, parent_tags) in text_with_parents:
151-
# Split the text into chunks of 2000 characters
152-
text_chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
153-
for chunk in text_chunks:
154-
text_params = {"plain_text": chunk}
155-
for parent in parent_tags:
156-
Html2JsonBase.parse_one_style(parent, text_params)
157-
# process inline line break
158-
if chunk == "<br>":
159-
try:
160-
res_obj[-1]["text"]["content"] += "\n"
161-
res_obj[-1]["plain_text"] += "\n"
162-
except Exception as e:
163-
pass
164-
# logger.error(f'{res_obj}, {str(e)}')
165-
continue
166-
167-
link_url = text_params.get("url", "")
168-
if text_params.get("url", "") and is_valid_url(link_url):
169-
text_obj = self.generate_link(**text_params)
170-
# Here image is a independent block, split out in the outer layer
171-
elif text_params.get("src", ""):
172-
text_obj = self.generate_image(**text_params)
173-
else:
158+
text_params = {"plain_text": text}
159+
for parent in parent_tags:
160+
Html2JsonBase.parse_one_style(parent, text_params)
161+
if text == "<br>":
162+
try:
163+
res_obj[-1]["text"]["content"] += "\n"
164+
res_obj[-1]["plain_text"] += "\n"
165+
except Exception as e:
166+
pass
167+
continue
168+
169+
link_url = text_params.get("url", "")
170+
text_obj = {}
171+
if text_params.get("url", "") and is_valid_url(link_url):
172+
text_obj = self.generate_link(**text_params)
173+
# An image is an independent block here; it is split out in the outer layer
174+
elif text_params.get("src", ""):
175+
text_obj = self.generate_image(**text_params)
176+
else:
177+
if len(text) <= self.TEXT_MAX_LENGTH:
174178
text_obj = self.generate_text(**text_params)
175-
if text_obj:
176-
res_obj.append(text_obj)
179+
else:
180+
for chunk in [text[i:i+self.TEXT_MAX_LENGTH] for i in range(0, len(text), self.TEXT_MAX_LENGTH)]:
181+
text_params["plain_text"] = chunk
182+
text_obj = self.generate_text(**text_params)
183+
if text_obj:
184+
res_obj.append(text_obj)
185+
text_obj = None
186+
if text_obj:
187+
res_obj.append(text_obj)
177188
return res_obj
178189

179190
def generate_link(self, **kwargs):
180191
link_url = kwargs.get("url", "")
181192
plain_text = kwargs.get("plain_text", "")
182-
if not plain_text:
193+
if not plain_text or not is_valid_url(link_url):
183194
return
184195

196+
link_url = link_url[:self.URL_MAX_LENGTH]
185197
self.import_stat.add_notion_text(plain_text)
186198
return {
187199
"href": link_url,
@@ -255,6 +267,12 @@ def is_same_annotations_text(text_one: dict, text_another: dict):
255267
if text_one["type"] != "text" or text_another["type"] != "text":
256268
return False
257269
attributes = ["annotations", "href"]
270+
271+
# When merging, be careful not to let the text length exceed the limit
272+
total_size = len(text_one["text"]["content"]) + len(text_another["text"]["content"])
273+
if total_size > Html2JsonBase.TEXT_MAX_LENGTH:
274+
return False
275+
258276
return all(text_one.get(attr) == text_another.get(attr) for attr in attributes)
259277

260278
@staticmethod
@@ -386,7 +404,7 @@ def convert_paragraph(self, soup):
386404

387405
# Split out images into independent blocks
388406
split_objs = Html2JsonBase.split_image_src(json_obj)
389-
return split_objs
407+
return Html2JsonBase.ensure_array_len(split_objs)
390408

391409
def convert_divider(self, soup):
392410
return {
@@ -507,7 +525,7 @@ def split_image_src(text_obj):
507525
rich_text = text_obj["paragraph"]["rich_text"]
508526
need_split = any(text.get("object") == "block" for text in rich_text)
509527
if not need_split:
510-
return text_obj
528+
return [text_obj]
511529

512530
split_obj = []
513531
cur_obj = {
@@ -552,6 +570,30 @@ def get_valid_language(language):
552570
return language
553571
return "plain text"
554572

573+
@staticmethod
574+
def ensure_array_len(blocks):
575+
final_objs = []
576+
for obj in blocks:
577+
if "paragraph" not in obj or "rich_text" not in obj["paragraph"] or len(
578+
obj["paragraph"]["rich_text"]) <= Html2JsonBase.RICHTEXT_ARRAY_LENGTH:
579+
final_objs.append(obj)
580+
continue
581+
582+
# If the length of rich_text is greater than RICHTEXT_ARRAY_LENGTH, we split it
583+
rich_text_arr = obj["paragraph"]["rich_text"]
584+
rich_texts = [rich_text_arr[i:i+Html2JsonBase.RICHTEXT_ARRAY_LENGTH]
585+
for i in range(0, len(rich_text_arr), Html2JsonBase.RICHTEXT_ARRAY_LENGTH)]
586+
for rich_text in rich_texts:
587+
new_json_obj = {
588+
"object": "block",
589+
"type": "paragraph",
590+
"paragraph": {
591+
"rich_text": rich_text
592+
}
593+
}
594+
final_objs.append(new_json_obj)
595+
return final_objs
596+
555597
@classmethod
556598
def register(cls, input_type, subclass):
557599
cls._registry[input_type] = subclass

html2notion/translate/html2json_markdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def convert_equation(self, soup: Tag):
161161
"rich_text": []
162162
}
163163
}
164-
expression = soup.get_text()
164+
expression = soup.get_text()[:Html2JsonBase.EXPRESSION_MAX_LENGTH]
165165
equation = json_obj["paragraph"]["rich_text"]
166166
equation.append({
167167
"type": "equation",

html2notion/translate/html2json_yinxiang.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ def convert_code(self, soup):
8181
css_dict = Html2JsonBase.get_tag_style(soup)
8282
language = css_dict.get('--en-codeblockLanguage', 'plain text')
8383
json_obj["code"]["language"] = language
84-
8584
return json_obj
8685

8786
def convert_quote(self, soup):

tests/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import pytest
2+
import os
3+
from html2notion.utils import test_prepare_conf, logger
4+
5+
6+
@pytest.fixture(autouse=True, scope='module')
def prepare_conf():
    """Autouse, module-scoped fixture that prepares configuration for tests.

    When the GITHUB_ACTIONS environment variable is present nothing is done.
    Otherwise test_prepare_conf() is called and a marker line is logged.
    """
    # NOTE(review): indentation was not visible in the reviewed view; the log
    # call is assumed to belong to the same guard as test_prepare_conf().
    if 'GITHUB_ACTIONS' in os.environ:
        return
    test_prepare_conf()
    logger.info("prepare_conf_fixture")

tests/test_batchimport.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,6 @@
1313
process_once_time = 0.5
1414

1515

16-
@pytest.fixture(scope="session", autouse=True)
17-
def prepare_conf_fixture():
18-
if 'GITHUB_ACTIONS' not in os.environ:
19-
from html2notion.utils import test_prepare_conf, logger
20-
test_prepare_conf()
21-
logger.info("prepare_conf_fixture")
22-
23-
2416
async def mock_notion_api_request(file_path, *args, **kwargs):
2517
class MockResponse:
2618
def __init__(self, status_code, file_content, elapsed_time):

tests/test_cosupload.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,6 @@
1212
from html2notion.utils.log import log_only_local
1313

1414

15-
@pytest.fixture(scope="session", autouse=True)
16-
def prepare_conf_fixture():
17-
if 'GITHUB_ACTIONS' not in os.environ:
18-
from html2notion.utils import test_prepare_conf
19-
test_prepare_conf()
20-
log_only_local("prepare_conf_fixture")
21-
22-
2315
async def mock_cos_upload_request(file_path, *args, **kwargs):
2416
if 'GITHUB_ACTIONS' not in os.environ:
2517
from html2notion.utils import config

tests/test_notionexport.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
from html2notion.translate.notion_export import NotionExporter
21
import os
32
import json
3+
from html2notion.translate.notion_export import NotionExporter
4+
from html2notion.utils import config
45

56

67
def test_check_is_delete():
@@ -40,8 +41,6 @@ def test_export_blocks():
4041
api_key = os.environ['notion_api_key']
4142
page_id = os.environ['notion_page_id_1']
4243
else:
43-
from html2notion.utils import config, test_prepare_conf
44-
test_prepare_conf()
4544
api_key = config['notion']['api_key']
4645
page_id = config['notion']['page_id']
4746

tests/test_reqlimit.py

Lines changed: 57 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,70 @@
1-
import os
1+
import json
22
from html2notion.translate.html2json_yinxiang import Html2JsonYinXiang
33
from html2notion.translate.import_stats import ImportStats
44

55

6-
paragram_rich_content = f'<div>{"Some words" * 400} more words</div>'
76
block_max_conent = "Some words" * 200
8-
paragram_rich_block = [
9-
{
10-
"object": "block",
11-
"type": "paragraph",
12-
"paragraph": {
13-
"rich_text": [
14-
{
15-
"plain_text": block_max_conent,
16-
"text": {
17-
"content": block_max_conent
18-
},
19-
"type": "text"
20-
},
21-
{
22-
"plain_text": block_max_conent,
23-
"text": {
24-
"content": block_max_conent
25-
},
26-
"type": "text"
27-
},
28-
{
29-
"plain_text": " more words",
30-
"text": {
31-
"content": " more words"
32-
},
33-
"type": "text"
34-
}
35-
]
36-
}
37-
}
38-
]
7+
one_text_obj = {
8+
"plain_text": block_max_conent,
9+
"text": {
10+
"content": block_max_conent
11+
},
12+
"type": "text"
13+
}
14+
remain_text_obj = {
15+
"plain_text": " more words",
16+
"text": {
17+
"content": " more words"
18+
},
19+
"type": "text"
20+
}
3921

4022

4123
def test_reqlimit():
42-
if 'GITHUB_ACTIONS' not in os.environ:
43-
from html2notion.utils import test_prepare_conf, logger
44-
test_prepare_conf()
45-
logger.info("prepare_conf_fixture")
24+
paragram_rich_block = [
25+
{
26+
"object": "block",
27+
"type": "paragraph",
28+
"paragraph": {
29+
"rich_text": [
30+
one_text_obj, one_text_obj, remain_text_obj
31+
]
32+
}
33+
}
34+
]
35+
36+
paragram_rich_content = f'<body><div>{block_max_conent * 2} more words</div></body>'
37+
import_stats = ImportStats()
38+
yinxiang = Html2JsonYinXiang(paragram_rich_content, import_stats)
39+
yinxiang.process()
40+
json_obj = yinxiang.children
41+
# print(json.dumps(json_obj, indent=4))
42+
assert json_obj == paragram_rich_block
4643

47-
html_jsons = {
48-
paragram_rich_content: paragram_rich_block,
49-
}
5044

51-
for html_content in html_jsons:
52-
body_content = '<body>' + html_content + '</body>'
53-
import_stats = ImportStats()
54-
yinxiang = Html2JsonYinXiang(body_content, import_stats)
55-
yinxiang.process()
56-
json_obj = yinxiang.children
57-
# print(json.dumps(json_obj, indent=4))
58-
assert json_obj == html_jsons[html_content]
45+
def test_code_reqlimit():
    """Over-long text inside a code block is split into several rich_text items.

    The input holds two full-size chunks of text followed by " more words";
    the converter is expected to emit a single code block whose rich_text
    array contains those three pieces in order.
    """
    html = f'<body><div style="-en-codeblock: true">{block_max_conent * 2} more words</div></body>'
    stats = ImportStats()
    converter = Html2JsonYinXiang(html, stats)
    converter.process()

    expected = [
        {
            "object": "block",
            "type": "code",
            "code": {
                "rich_text": [one_text_obj, one_text_obj, remain_text_obj],
                "language": "plain text"
            }
        }
    ]
    assert converter.children == expected
5966

6067

6168
if __name__ == '__main__':
62-
test_reqlimit()
69+
# test_reqlimit()
70+
test_code_reqlimit()

tests/test_yinxiang.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -838,11 +838,6 @@
838838
]
839839

840840
def test_convert():
841-
if 'GITHUB_ACTIONS' not in os.environ:
842-
from html2notion.utils import test_prepare_conf, logger
843-
test_prepare_conf()
844-
logger.info("prepare_conf_fixture")
845-
846841
html_jsons = {
847842
paragram_br_content: paragram_br_block,
848843
link_content: link_block,

0 commit comments

Comments
 (0)