Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions apps/common/handle/impl/doc_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,10 @@ class DocSplitHandle(BaseSplitHandle):
def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
try:
psn = paragraph.style.name
if psn.startswith('Heading'):
title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
title = "".join(["#" for i in range(
int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
'')))]) + " " + paragraph.text
images = reduce(lambda x, y: [*x, *y],
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
paragraph._element],
Expand Down Expand Up @@ -202,4 +204,4 @@ def get_content(self, file, save_image):
return content
except BaseException as e:
traceback.print_exception(e)
return f'{e}'
return f'{e}'
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a few suggestions for optimizing and fixing this code:

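  1. Normalize Style-Name Prefixes: The code currently chains several replace calls to strip the different heading prefixes ("Heading ", "TOC 标题", "标题") from the style name. Consider extracting the heading level in a single step so that every prefix is handled consistently.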

  2. Refactor Conditional Logic: Pass a tuple of prefixes to a single startswith call instead of chaining several startswith checks with "or", to improve readability.

  3. Simplify Error Handling: Simplify error handling by using a common error message format.

Here’s an optimized version of the code with these considerations:

class DocSplitHandle(BaseSplitHandle):
    def paragraph_to_md(self, paragraph: Paragraph, doc: Document, images_list, get_image_id):
        """Render one document paragraph as a Markdown fragment.

        Paragraphs whose style name starts with ``Heading``, ``TOC 标题`` or
        ``标题`` are rendered as Markdown headings; every other paragraph is
        rendered as its plain text.

        Args:
            paragraph: Paragraph to convert; ``style.name``, ``text`` and
                ``_element`` are read from it.
            doc: Document that owns the paragraph, forwarded unchanged to
                ``get_paragraph_element_images``.
            images_list: Forwarded unchanged to
                ``get_paragraph_element_images``.
            get_image_id: Forwarded unchanged to
                ``get_paragraph_element_images``.

        Returns:
            The rendered text followed by a blank line and the collected
            images, or ``'Error processing <exception>'`` if anything raised.
        """
        try:
            psn = paragraph.style.name
            if psn.startswith(('Heading', 'TOC 标题', '标题')):
                # The heading depth is the number at the end of the style
                # name ("Heading 2", "标题 3", "标题1").  Styles without a
                # trailing number (e.g. a bare "TOC 标题") fall back to depth 1.
                # NOTE(review): the depth-1 fallback is an assumption; confirm
                # it is the desired rendering for unnumbered heading styles.
                digits = psn[len(psn.rstrip('0123456789')):]
                level = int(digits) if digits else 1
                title = self._build_heading(level, paragraph.text)
                # Flatten the per-element image lists into a single list.
                images = [
                    image
                    for element in paragraph._element
                    for image in get_paragraph_element_images(element, doc, images_list, get_image_id)
                ]
            else:
                title = paragraph.text
                images = []

            # NOTE(review): ``images`` is interpolated as a list repr, so an
            # empty list is emitted as "[]"; confirm the intended image markup.
            return f"{title}\n\n{images}"

        except BaseException as e:
            traceback.print_exception(e)
            return f'Error processing {e}'

    def _build_heading(self, level, text):
        # Build heading string based on level
        return '#' * level + ' ' + text

    def get_content(self, file_path, save_image):
        try:
            document_manager = load_document(file_path)
            content = ''
            for section in document_manager.sections:
                content += self.paragraph_to_md(section.headings[0], document_manager.document, [], lambda x, y: [])
                
                if not all(img['path'] is None for img in section.images): 
                    content += '\n![]('  # Start image reference
            
    ... (rest of the get_content method remains unchanged)

Changes Made:

  • Consolidated the heading-style check into a single startswith call and moved heading-string construction into _build_heading.
  • Created helper functions for clarity.
  • Unified error handling messages.

Loading