Skip to content

Commit a4ef19b

Browse files
committed
Update chunker
1 parent 37f3a6b commit a4ef19b

File tree

1 file changed

+7
-8
lines changed

1 file changed

+7
-8
lines changed

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def __init__(
1515
num_surrounding_sentences: int = 1,
1616
similarity_threshold: float = 0.8,
1717
max_chunk_tokens: int = 200,
18-
min_chunk_tokens: int = 50
18+
min_chunk_tokens: int = 50,
1919
):
2020
self.num_surrounding_sentences = num_surrounding_sentences
2121
self.similarity_threshold = similarity_threshold
@@ -266,7 +266,7 @@ def look_ahead_and_behind_sentences(
266266
next_sentence_is_table_or_figure,
267267
) in enumerate(
268268
is_table_or_figure_map[
269-
current_sentence_index: current_sentence_index
269+
current_sentence_index : current_sentence_index
270270
+ surround_sentences_gap_to_test
271271
]
272272
):
@@ -300,8 +300,7 @@ def retrive_current_chunk_at_n(n):
300300
else:
301301
return current_chunk[n]
302302

303-
current_chunk_tokens = self.num_tokens_from_string(
304-
" ".join(current_chunk))
303+
current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk))
305304

306305
if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
307306
logging.info("Comparing chunks")
@@ -403,13 +402,13 @@ def retrieve_current_chunk():
403402
new_is_table_or_figure_map.append(False)
404403
if forwards_direction:
405404
current_chunk = sentences[
406-
current_sentence_index: current_sentence_index
405+
current_sentence_index : current_sentence_index
407406
+ min_of_distance_to_next_figure_or_num_surrounding_sentences
408407
]
409408
else:
410409
current_chunk = sentences[
411-
current_sentence_index: current_sentence_index
412-
- min_of_distance_to_next_figure_or_num_surrounding_sentences: -1
410+
current_sentence_index : current_sentence_index
411+
- min_of_distance_to_next_figure_or_num_surrounding_sentences : -1
413412
]
414413
index += min_of_distance_to_next_figure_or_num_surrounding_sentences
415414
continue
@@ -490,7 +489,7 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
490489
logging.error("Chunking Error: %s", e)
491490
return {
492491
"recordId": record["recordId"],
493-
"data": {},
492+
"data": None,
494493
"errors": [
495494
{
496495
"message": "Failed to chunk data. Check function app logs for more details of exact failure."

0 commit comments

Comments
 (0)