Skip to content

Multi-level outline numbering incorrectly rendered as single-level in DOCX #2758

@atok666

Description

@atok666

Bug

When parsing DOCX documents that use Word's multi-level outline numbering (e.g., "3.1", "3.2"), Docling renders the numbering as single-level only (e.g., "1", "2", "3"), losing the hierarchical structure.

Expected: "3.1 Introduction", "3.2 Solution Overview", "3.3 Requirements"
Actual: "1 Introduction", "2 Solution Overview", "3 Requirements"

Steps to reproduce

from docx import Document
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from pathlib import Path

# Build a fresh document and grab the root element of its numbering part,
# where the outline numbering definition is registered below.
doc = Document()
numbering_xml = doc.part.numbering_part._element


def _build_level(ilvl, start, lvl_text):
    """Return a <w:lvl> element with decimal format for one outline level.

    Children are appended in the order w:start, w:numFmt, w:lvlText.
    """
    level = OxmlElement('w:lvl')
    level.set(qn('w:ilvl'), ilvl)
    for tag, value in (
        ('w:start', start),
        ('w:numFmt', 'decimal'),
        ('w:lvlText', lvl_text),
    ):
        child = OxmlElement(tag)
        child.set(qn('w:val'), value)
        level.append(child)
    return level


# Abstract numbering definition holding both levels.
abstract_num = OxmlElement('w:abstractNum')
abstract_num.set(qn('w:abstractNumId'), '1')

# Level 0 is the chapter ("3", starting at 3); level 1 is the section
# ("3.1", "3.2", ... starting at 1).
for level_spec in (('0', '3', '%1'), ('1', '1', '%1.%2')):
    abstract_num.append(_build_level(*level_spec))

numbering_xml.insert(0, abstract_num)

# Concrete numbering instance (numId 1) pointing at the abstract definition.
num_instance = OxmlElement('w:num')
num_instance.set(qn('w:numId'), '1')
abstract_ref = OxmlElement('w:abstractNumId')
abstract_ref.set(qn('w:val'), '1')
num_instance.append(abstract_ref)
numbering_xml.append(num_instance)

def add_numbered_heading(doc, text, level, num_id):
    """Append a heading paragraph that is bound to a numbering definition.

    Args:
        doc: Document to which the paragraph is added via ``add_paragraph``.
        text: Heading text, added as a single run.
        level: 1-based heading level; selects the ``Heading {level}`` style
            and is written as the 0-based ``w:ilvl`` value (``level - 1``).
        num_id: Value written to ``w:numId`` to select the numbering instance.

    Returns:
        The newly created paragraph.
    """
    p = doc.add_paragraph()
    p.style = f'Heading {level}'

    # Build <w:numPr><w:ilvl/><w:numId/></w:numPr> and attach it to the
    # paragraph properties so the paragraph takes part in the numbering.
    numPr = OxmlElement('w:numPr')
    ilvl = OxmlElement('w:ilvl')
    ilvl.set(qn('w:val'), str(level - 1))
    numPr.append(ilvl)
    numId_elem = OxmlElement('w:numId')
    numId_elem.set(qn('w:val'), str(num_id))
    numPr.append(numId_elem)
    p._p.get_or_add_pPr().append(numPr)

    # The run object itself is not needed afterwards, so it is not bound.
    p.add_run(text)
    return p

# Three level-2 headings that all use numbering instance 1, each followed
# by a plain paragraph naming the section number the heading should show.
sections = (
    ("Introduction", "Content under 3.1"),
    ("Solution Overview", "Content under 3.2"),
    ("Requirements", "Content under 3.3"),
)
for heading_text, body_text in sections:
    add_numbered_heading(doc, heading_text, 2, 1)
    doc.add_paragraph(body_text)

doc.save("/tmp/test_outline_numbering.docx")

# Convert the saved file with Docling and print each item's label and text.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("/tmp/test_outline_numbering.docx")

for entry in result.document.iterate_items():
    # Entries may arrive as tuples; in that case the element is the first member.
    if isinstance(entry, tuple):
        element = entry[0]
    else:
        element = entry

    # Use the label's ``value`` attribute when it has one, else its string form.
    if hasattr(element.label, 'value'):
        label = element.label.value
    else:
        label = str(element.label)

    # Elements without a ``text`` attribute print as an empty string.
    text = getattr(element, 'text', "")
    print(f"{label}: {text}")

Output:
section_header: 1 Introduction
text: Content under 3.1
section_header: 2 Solution Overview
text: Content under 3.2
section_header: 3 Requirements
text: Content under 3.3

Docling version

2.64.0

Python version

Python 3.11.14

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working) · docx (issue related to docx backend) · good first issue (Issues and pull requests for new contributors)

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions