@@ -19,7 +19,14 @@ class Block(Enum):
1919 TO_DO = "to_do"
2020 EQUATION = "equation"
2121
22+
2223class Html2JsonBase :
24+ # https://developers.notion.com/reference/request-limits
25+ URL_MAX_LENGTH = 2000
26+ TEXT_MAX_LENGTH = 2000
27+ EXPRESSION_MAX_LENGTH = 1000
28+ RICHTEXT_ARRAY_LENGTH = 100
29+
2330 _registry = {}
2431 _text_annotations = {
2532 "bold" : bool ,
@@ -148,40 +155,45 @@ def generate_inline_obj(self, tag: PageElement):
148155 res_obj = []
149156 text_with_parents = Html2JsonBase .extract_text_and_parents (tag )
150157 for (text , parent_tags ) in text_with_parents :
151- # Split the text into chunks of 2000 characters
152- text_chunks = [text [i :i + 2000 ] for i in range (0 , len (text ), 2000 )]
153- for chunk in text_chunks :
154- text_params = {"plain_text" : chunk }
155- for parent in parent_tags :
156- Html2JsonBase .parse_one_style (parent , text_params )
157- # process inline line break
158- if chunk == "<br>" :
159- try :
160- res_obj [- 1 ]["text" ]["content" ] += "\n "
161- res_obj [- 1 ]["plain_text" ] += "\n "
162- except Exception as e :
163- pass
164- # logger.error(f'{res_obj}, {str(e)}')
165- continue
166-
167- link_url = text_params .get ("url" , "" )
168- if text_params .get ("url" , "" ) and is_valid_url (link_url ):
169- text_obj = self .generate_link (** text_params )
170- # Here image is a independent block, split out in the outer layer
171- elif text_params .get ("src" , "" ):
172- text_obj = self .generate_image (** text_params )
173- else :
158+ text_params = {"plain_text" : text }
159+ for parent in parent_tags :
160+ Html2JsonBase .parse_one_style (parent , text_params )
161+ if text == "<br>" :
162+ try :
163+ res_obj [- 1 ]["text" ]["content" ] += "\n "
164+ res_obj [- 1 ]["plain_text" ] += "\n "
165+ except Exception as e :
166+ pass
167+ continue
168+
169+ link_url = text_params .get ("url" , "" )
170+ text_obj = {}
171+ if text_params .get ("url" , "" ) and is_valid_url (link_url ):
172+ text_obj = self .generate_link (** text_params )
173+ # An image is an independent block; it is split out in the outer layer
174+ elif text_params .get ("src" , "" ):
175+ text_obj = self .generate_image (** text_params )
176+ else :
177+ if len (text ) <= self .TEXT_MAX_LENGTH :
174178 text_obj = self .generate_text (** text_params )
175- if text_obj :
176- res_obj .append (text_obj )
179+ else :
180+ for chunk in [text [i :i + self .TEXT_MAX_LENGTH ] for i in range (0 , len (text ), self .TEXT_MAX_LENGTH )]:
181+ text_params ["plain_text" ] = chunk
182+ text_obj = self .generate_text (** text_params )
183+ if text_obj :
184+ res_obj .append (text_obj )
185+ text_obj = None
186+ if text_obj :
187+ res_obj .append (text_obj )
177188 return res_obj
178189
179190 def generate_link (self , ** kwargs ):
180191 link_url = kwargs .get ("url" , "" )
181192 plain_text = kwargs .get ("plain_text" , "" )
182- if not plain_text :
193+ if not plain_text or not is_valid_url ( link_url ) :
183194 return
184195
196+ link_url = link_url [:self .URL_MAX_LENGTH ]
185197 self .import_stat .add_notion_text (plain_text )
186198 return {
187199 "href" : link_url ,
@@ -255,6 +267,12 @@ def is_same_annotations_text(text_one: dict, text_another: dict):
255267 if text_one ["type" ] != "text" or text_another ["type" ] != "text" :
256268 return False
257269 attributes = ["annotations" , "href" ]
270+
271+ # When merging, be careful not to let the text length exceed the limit
272+ total_size = len (text_one ["text" ]["content" ]) + len (text_another ["text" ]["content" ])
273+ if total_size > Html2JsonBase .TEXT_MAX_LENGTH :
274+ return False
275+
258276 return all (text_one .get (attr ) == text_another .get (attr ) for attr in attributes )
259277
260278 @staticmethod
@@ -386,7 +404,7 @@ def convert_paragraph(self, soup):
386404
387405 # Split out images into independent blocks
388406 split_objs = Html2JsonBase .split_image_src (json_obj )
389- return split_objs
407+ return Html2JsonBase . ensure_array_len ( split_objs )
390408
391409 def convert_divider (self , soup ):
392410 return {
@@ -507,7 +525,7 @@ def split_image_src(text_obj):
507525 rich_text = text_obj ["paragraph" ]["rich_text" ]
508526 need_split = any (text .get ("object" ) == "block" for text in rich_text )
509527 if not need_split :
510- return text_obj
528+ return [ text_obj ]
511529
512530 split_obj = []
513531 cur_obj = {
@@ -552,6 +570,30 @@ def get_valid_language(language):
552570 return language
553571 return "plain text"
554572
573+ @staticmethod
574+ def ensure_array_len (blocks ):
575+ final_objs = []
576+ for obj in blocks :
577+ if "paragraph" not in obj or "rich_text" not in obj ["paragraph" ] or len (
578+ obj ["paragraph" ]["rich_text" ]) <= Html2JsonBase .RICHTEXT_ARRAY_LENGTH :
579+ final_objs .append (obj )
580+ continue
581+
582+ # If the length of rich_text is greater than RICHTEXT_ARRAY_LENGTH, we split it
583+ rich_text_arr = obj ["paragraph" ]["rich_text" ]
584+ rich_texts = [rich_text_arr [i :i + Html2JsonBase .RICHTEXT_ARRAY_LENGTH ]
585+ for i in range (0 , len (rich_text_arr ), Html2JsonBase .RICHTEXT_ARRAY_LENGTH )]
586+ for rich_text in rich_texts :
587+ new_json_obj = {
588+ "object" : "block" ,
589+ "type" : "paragraph" ,
590+ "paragraph" : {
591+ "rich_text" : rich_text
592+ }
593+ }
594+ final_objs .append (new_json_obj )
595+ return final_objs
596+
555597 @classmethod
556598 def register (cls , input_type , subclass ):
557599 cls ._registry [input_type ] = subclass
0 commit comments