Observe some length limits for Notion, fix #5

selfboot · selfboot · commit b2935eb8b82f · 2023-05-18T21:11:43.000+08:00
diff --git a/html2notion/translate/html2json_base.py b/html2notion/translate/html2json_base.py
@@ -19,7 +19,14 @@ class Block(Enum):
     TO_DO = "to_do"
     EQUATION = "equation"
 
+
 class Html2JsonBase:
+    # https://developers.notion.com/reference/request-limits
+    URL_MAX_LENGTH = 2000
+    TEXT_MAX_LENGTH = 2000
+    EXPRESSION_MAX_LENGTH = 1000
+    RICHTEXT_ARRAY_LENGTH = 100
+
     _registry = {}
     _text_annotations = {
         "bold": bool,
@@ -148,40 +155,45 @@ def generate_inline_obj(self, tag: PageElement):
         res_obj = []
         text_with_parents = Html2JsonBase.extract_text_and_parents(tag)
         for (text, parent_tags) in text_with_parents:
-            # Split the text into chunks of 2000 characters
-            text_chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
-            for chunk in text_chunks:
-                text_params = {"plain_text": chunk}
-                for parent in parent_tags:
-                    Html2JsonBase.parse_one_style(parent, text_params)
-                # process inline line break
-                if chunk == "<br>":
-                    try:
-                        res_obj[-1]["text"]["content"] += "\n"
-                        res_obj[-1]["plain_text"] += "\n"
-                    except Exception as e:
-                        pass
-                        # logger.error(f'{res_obj}, {str(e)}')
-                    continue
-                
-                link_url = text_params.get("url", "")
-                if text_params.get("url", "") and is_valid_url(link_url):
-                    text_obj = self.generate_link(**text_params)
-                # Here image is a independent block, split out in the outer layer
-                elif text_params.get("src", ""):
-                    text_obj = self.generate_image(**text_params)
-                else:
+            text_params = {"plain_text": text}
+            for parent in parent_tags:
+                Html2JsonBase.parse_one_style(parent, text_params)
+            if text == "<br>":
+                try:
+                    res_obj[-1]["text"]["content"] += "\n"
+                    res_obj[-1]["plain_text"] += "\n"
+                except Exception as e:
+                    pass
+                continue
+
+            link_url = text_params.get("url", "")
+            text_obj = {}
+            if text_params.get("url", "") and is_valid_url(link_url):
+                text_obj = self.generate_link(**text_params)
+            # Here an image is an independent block, split out in the outer layer
+            elif text_params.get("src", ""):
+                text_obj = self.generate_image(**text_params)
+            else:
+                if len(text) <= self.TEXT_MAX_LENGTH:
                     text_obj = self.generate_text(**text_params)
-                if text_obj:
-                    res_obj.append(text_obj)
+                else:
+                    for chunk in [text[i:i+self.TEXT_MAX_LENGTH] for i in range(0, len(text), self.TEXT_MAX_LENGTH)]:
+                        text_params["plain_text"] = chunk
+                        text_obj = self.generate_text(**text_params)
+                        if text_obj:
+                            res_obj.append(text_obj)
+                    text_obj = None
+            if text_obj:
+                res_obj.append(text_obj)
         return res_obj
 
     def generate_link(self, **kwargs):
         link_url = kwargs.get("url", "")
         plain_text = kwargs.get("plain_text", "")
-        if not plain_text:
+        if not plain_text or not is_valid_url(link_url):
             return
 
+        link_url = link_url[:self.URL_MAX_LENGTH]
         self.import_stat.add_notion_text(plain_text)
         return {
             "href": link_url,
@@ -255,6 +267,12 @@ def is_same_annotations_text(text_one: dict, text_another: dict):
         if text_one["type"] != "text" or text_another["type"] != "text":
             return False
         attributes = ["annotations", "href"]
+
+        # When merging, be careful not to let the text length exceed the limit
+        total_size = len(text_one["text"]["content"]) + len(text_another["text"]["content"])
+        if total_size > Html2JsonBase.TEXT_MAX_LENGTH:
+            return False
+
         return all(text_one.get(attr) == text_another.get(attr) for attr in attributes)
 
     @staticmethod
@@ -386,7 +404,7 @@ def convert_paragraph(self, soup):
 
         # Split out image into a independent blocks
         split_objs = Html2JsonBase.split_image_src(json_obj)
-        return split_objs
+        return Html2JsonBase.ensure_array_len(split_objs)
 
     def convert_divider(self, soup):
         return {
@@ -507,7 +525,7 @@ def split_image_src(text_obj):
         rich_text = text_obj["paragraph"]["rich_text"]
         need_split = any(text.get("object") == "block" for text in rich_text)
         if not need_split:
-            return text_obj
+            return [text_obj]
         
         split_obj = []
         cur_obj = {
@@ -552,6 +570,30 @@ def get_valid_language(language):
             return language
         return "plain text"
     
+    @staticmethod
+    def ensure_array_len(blocks):
+        final_objs = []
+        for obj in blocks:
+            if "paragraph" not in obj or "rich_text" not in obj["paragraph"] or len(
+                    obj["paragraph"]["rich_text"]) <= Html2JsonBase.RICHTEXT_ARRAY_LENGTH:
+                final_objs.append(obj)
+                continue
+
+            # If the length of rich_text is greater than RICHTEXT_ARRAY_LENGTH, we split it
+            rich_text_arr = obj["paragraph"]["rich_text"]
+            rich_texts = [rich_text_arr[i:i+Html2JsonBase.RICHTEXT_ARRAY_LENGTH]
+                          for i in range(0, len(rich_text_arr), Html2JsonBase.RICHTEXT_ARRAY_LENGTH)]
+            for rich_text in rich_texts:
+                new_json_obj = {
+                    "object": "block",
+                    "type": "paragraph",
+                    "paragraph": {
+                        "rich_text": rich_text
+                    }
+                }
+                final_objs.append(new_json_obj)
+        return final_objs
+
     @classmethod
     def register(cls, input_type, subclass):
         cls._registry[input_type] = subclass
diff --git a/html2notion/translate/html2json_markdown.py b/html2notion/translate/html2json_markdown.py
@@ -161,7 +161,7 @@ def convert_equation(self, soup: Tag):
                 "rich_text": []
             }
         }
-        expression = soup.get_text()
+        expression = soup.get_text()[:Html2JsonBase.EXPRESSION_MAX_LENGTH]
         equation = json_obj["paragraph"]["rich_text"]
         equation.append({
             "type": "equation",
diff --git a/html2notion/translate/html2json_yinxiang.py b/html2notion/translate/html2json_yinxiang.py
@@ -81,7 +81,6 @@ def convert_code(self, soup):
         css_dict = Html2JsonBase.get_tag_style(soup)
         language = css_dict.get('--en-codeblockLanguage', 'plain text')
         json_obj["code"]["language"] = language
-        
         return json_obj
 
     def convert_quote(self, soup):
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,10 @@
+import pytest
+import os
+from html2notion.utils import test_prepare_conf, logger
+
+
+@pytest.fixture(autouse=True, scope='module')
+def prepare_conf():
+    if 'GITHUB_ACTIONS' not in os.environ:
+        test_prepare_conf()
+        logger.info("prepare_conf_fixture")
diff --git a/tests/test_batchimport.py b/tests/test_batchimport.py
@@ -13,14 +13,6 @@
 process_once_time = 0.5
 
 
-@pytest.fixture(scope="session", autouse=True)
-def prepare_conf_fixture():
-    if 'GITHUB_ACTIONS' not in os.environ:
-        from html2notion.utils import test_prepare_conf, logger
-        test_prepare_conf()
-        logger.info("prepare_conf_fixture")
-
-
 async def mock_notion_api_request(file_path, *args, **kwargs):
     class MockResponse:
         def __init__(self, status_code, file_content, elapsed_time):
diff --git a/tests/test_cosupload.py b/tests/test_cosupload.py
@@ -12,14 +12,6 @@
 from html2notion.utils.log import log_only_local
 
 
-@pytest.fixture(scope="session", autouse=True)
-def prepare_conf_fixture():
-    if 'GITHUB_ACTIONS' not in os.environ:
-        from html2notion.utils import test_prepare_conf
-        test_prepare_conf()
-        log_only_local("prepare_conf_fixture")
-
-
 async def mock_cos_upload_request(file_path, *args, **kwargs):
     if 'GITHUB_ACTIONS' not in os.environ:
         from html2notion.utils import config
diff --git a/tests/test_notionexport.py b/tests/test_notionexport.py
@@ -1,6 +1,7 @@
-from html2notion.translate.notion_export import NotionExporter
 import os
 import json
+from html2notion.translate.notion_export import NotionExporter
+from html2notion.utils import config
 
 
 def test_check_is_delete():
@@ -40,8 +41,6 @@ def test_export_blocks():
         api_key = os.environ['notion_api_key']
         page_id = os.environ['notion_page_id_1']
     else:
-        from html2notion.utils import config, test_prepare_conf
-        test_prepare_conf()
         api_key = config['notion']['api_key']
         page_id = config['notion']['page_id']
 
diff --git a/tests/test_reqlimit.py b/tests/test_reqlimit.py
@@ -1,62 +1,70 @@
-import os
+import json
 from html2notion.translate.html2json_yinxiang import Html2JsonYinXiang
 from html2notion.translate.import_stats import ImportStats
 
 
-paragram_rich_content = f'<div>{"Some words" * 400} more words</div>'
 block_max_conent = "Some words" * 200
-paragram_rich_block = [
-    {
-        "object": "block",
-        "type": "paragraph",
-        "paragraph": {
-            "rich_text": [
-                {
-                    "plain_text": block_max_conent,
-                    "text": {
-                        "content": block_max_conent
-                    },
-                    "type": "text"
-                },
-                {
-                    "plain_text": block_max_conent,
-                    "text": {
-                        "content": block_max_conent
-                    },
-                    "type": "text"
-                },
-                {
-                    "plain_text": " more words",
-                    "text": {
-                        "content": " more words"
-                    },
-                    "type": "text"
-                }
-            ]
-        }
-    }
-]
+one_text_obj = {
+    "plain_text": block_max_conent,
+    "text": {
+        "content": block_max_conent
+    },
+    "type": "text"
+}
+remain_text_obj = {
+    "plain_text": " more words",
+    "text": {
+        "content": " more words"
+    },
+    "type": "text"
+}
 
 
 def test_reqlimit():
-    if 'GITHUB_ACTIONS' not in os.environ:
-        from html2notion.utils import test_prepare_conf, logger
-        test_prepare_conf()
-        logger.info("prepare_conf_fixture")
+    paragram_rich_block = [
+        {
+            "object": "block",
+            "type": "paragraph",
+            "paragraph": {
+                "rich_text": [
+                    one_text_obj, one_text_obj, remain_text_obj
+                ]
+            }
+        }
+    ]
+
+    paragram_rich_content = f'<body><div>{block_max_conent * 2} more words</div></body>'
+    import_stats = ImportStats()
+    yinxiang = Html2JsonYinXiang(paragram_rich_content, import_stats)
+    yinxiang.process()
+    json_obj = yinxiang.children
+    # print(json.dumps(json_obj, indent=4))
+    assert json_obj == paragram_rich_block
 
-    html_jsons = {
-        paragram_rich_content: paragram_rich_block,
-    }
 
-    for html_content in html_jsons:
-        body_content = '<body>' + html_content + '</body>'
-        import_stats = ImportStats()
-        yinxiang = Html2JsonYinXiang(body_content, import_stats)
-        yinxiang.process()
-        json_obj = yinxiang.children
-        # print(json.dumps(json_obj, indent=4))
-        assert json_obj == html_jsons[html_content]
+def test_code_reqlimit():
+    code_rich_content = f'<body><div style="-en-codeblock: true">{block_max_conent * 2} more words</div></body>'
+    import_stats = ImportStats()
+    yinxiang = Html2JsonYinXiang(code_rich_content, import_stats)
+    yinxiang.process()
+    json_obj = yinxiang.children
+    # print(json.dumps(json_obj, indent=4))
+
+    split_block_result = [
+        {
+            "object": "block",
+            "type": "code",
+            "code": {
+                "rich_text": [
+                    one_text_obj, one_text_obj, remain_text_obj
+                ],
+                "language": "plain text"
+            }
+        }
+    ]
+    assert json_obj == split_block_result
 
 
 if __name__ == '__main__':
-    test_reqlimit()
+    # test_reqlimit()
+    test_code_reqlimit()
diff --git a/tests/test_yinxiang.py b/tests/test_yinxiang.py
@@ -838,11 +838,6 @@
 ]
 
 def test_convert():
-    if 'GITHUB_ACTIONS' not in os.environ:
-        from html2notion.utils import test_prepare_conf, logger
-        test_prepare_conf()
-        logger.info("prepare_conf_fixture")
-
     html_jsons = {
         paragram_br_content: paragram_br_block,
         link_content: link_block,

Original file line number	Diff line number	Diff line change
`@@ -161,7 +161,7 @@ def convert_equation(self, soup: Tag):`
`161`	`161`	`"rich_text": []`
`162`	`162`	`}`
`163`	`163`	`}`
`164`		`- expression = soup.get_text()`
	`164`	`+ expression = soup.get_text()[:Html2JsonBase.EXPRESSION_MAX_LENGTH]`
`165`	`165`	`equation = json_obj["paragraph"]["rich_text"]`
`166`	`166`	`equation.append({`
`167`	`167`	`"type": "equation",`