docling-project · krrome · Nov 24, 2025
diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py
@@ -7,7 +7,7 @@
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
+from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument, PdfTableOfContents
 from PIL import Image
 from pypdfium2 import PdfPage
 
@@ -225,6 +225,9 @@ def page_count(self) -> int:
 
         return len_2
 
+    def get_table_of_contents(self) -> Optional[PdfTableOfContents]:
+        """Return the table of contents reported by the parsed document.
+
+        The value is exactly what ``self.dp_doc.get_table_of_contents()``
+        returns; per the annotation it may be ``None``.
+        """
+        toc = self.dp_doc.get_table_of_contents()
+        return toc
+
     def load_page(
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:

diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
@@ -401,6 +401,10 @@ def load_page(self, page_no: int) -> PyPdfiumPageBackend:
     def is_valid(self) -> bool:
+        """Return True when ``page_count()`` reports at least one page."""
         return self.page_count() > 0
 
+    def get_table_of_contents(self) -> list[dict]:
+        """Return the entries produced by ``self._pdoc.get_toc()`` as a list.
+
+        The iterable is fully consumed while ``pypdfium2_lock`` is held.
+        """
+        # NOTE(review): the annotation promises dicts; confirm the item type
+        # that ``get_toc()`` yields for the pinned pypdfium2 version.
+        with pypdfium2_lock:
+            toc_entries = list(self._pdoc.get_toc())
+        return toc_entries
+
     def unload(self):
         super().unload()
         with pypdfium2_lock:

diff --git a/docling/models/header_hierarchy/__init__.py b/docling/models/header_hierarchy/__init__.py
diff --git a/docling/models/header_hierarchy/hierarchy_builder.py b/docling/models/header_hierarchy/hierarchy_builder.py
@@ -0,0 +1,67 @@
+import logging
+
+from pydantic import BaseModel
+
+from docling.datamodel.document import ConversionResult
+from docling.models.header_hierarchy.metadata_hierarchy import HierarchyBuilderMetadata
+from docling.models.header_hierarchy.style_based_hierarchy import StyleBasedHierarchy
+from docling.models.header_hierarchy.types.hierarchical_header import HierarchicalHeader
+from docling.models.readingorder_model import ReadingOrderPageElement
+
+_log = logging.getLogger(__name__)
+
+
+class PDFHeaderHierarchyOptions(BaseModel):
+    """Options read by ``HierarchyBuilder`` when building a PDF header hierarchy."""
+
+    # When True, HierarchyBuilder first tries HierarchyBuilderMetadata, which
+    # works from the table of contents exposed by the PDF backend.
+    use_toc_hierarchy: bool = True
+    # reset_additional_headers_to_toc: bool = True
+
+    # The four options below are forwarded unchanged to StyleBasedHierarchy.
+    # That builder only runs when at least one of the two ``infer_hierarchy_*``
+    # flags is True and the TOC step produced no headers.
+    # NOTE(review): their exact semantics live in StyleBasedHierarchy, which is
+    # not visible from this module - confirm there before relying on them.
+    remove_duplicate_headers: bool = True
+    infer_hierarchy_from_style: bool = True
+    infer_hierarchy_from_numbering: bool = True
+    min_prop_numbered: float = 0.3
+
+    # When True, an exception from the TOC-based builder is re-raised by
+    # HierarchyBuilder; when False it is logged and processing continues.
+    # The flag is also passed on to both builders.
+    raise_on_error: bool = False
+
+
+class HierarchyBuilder:
+    """Build the header hierarchy for a converted PDF.
+
+    The table-of-contents based builder (``HierarchyBuilderMetadata``) is tried
+    first when enabled. The style/numbering based builder
+    (``StyleBasedHierarchy``) is used only when the first step produced a root
+    without children and at least one of the inference options is enabled.
+    """
+
+    def __init__(self, options: PDFHeaderHierarchyOptions):
+        self.options = options
+
+    def __call__(
+        self,
+        conv_res: ConversionResult,
+        sorted_elements: list[ReadingOrderPageElement],
+    ) -> HierarchicalHeader:
+        """Return the root of the inferred header hierarchy.
+
+        Args:
+            conv_res: Conversion result handed to both builders.
+            sorted_elements: Reading-order elements handed to both builders.
+
+        Returns:
+            The root ``HierarchicalHeader``; it has no children when neither
+            builder produced any header.
+
+        Raises:
+            Exception: Whatever the TOC-based builder raised, but only when
+                ``options.raise_on_error`` is True. Otherwise the failure is
+                logged and the style-based fallback may still run.
+        """
+        opts = self.options
+        root = HierarchicalHeader()
+
+        if opts.use_toc_hierarchy:
+            try:
+                hbm = HierarchyBuilderMetadata(
+                    conv_res=conv_res,
+                    sorted_elements=sorted_elements,
+                    raise_on_error=opts.raise_on_error,
+                )
+                root = hbm.infer()
+            except Exception as e:
+                if opts.raise_on_error:
+                    # Bare ``raise`` re-raises the active exception without
+                    # adding this frame to its traceback a second time.
+                    raise
+                _log.error(
+                    f"HierarchyBuilderMetadata infer failed with exception {type(e)}: '{e}'"
+                )
+
+        style_inference_enabled = (
+            opts.infer_hierarchy_from_numbering or opts.infer_hierarchy_from_style
+        )
+        # Fall back to style/numbering inference only when the TOC gave nothing.
+        if style_inference_enabled and not root.children:
+            sbh = StyleBasedHierarchy(
+                conv_res=conv_res,
+                sorted_elements=sorted_elements,
+                raise_on_error=opts.raise_on_error,
+                remove_duplicate_headers=opts.remove_duplicate_headers,
+                infer_hierarchy_from_style=opts.infer_hierarchy_from_style,
+                infer_hierarchy_from_numbering=opts.infer_hierarchy_from_numbering,
+                min_prop_numbered=opts.min_prop_numbered,
+            )
+            root = sbh.process()
+
+        return root
diff --git a/docling/models/header_hierarchy/metadata_hierarchy.py b/docling/models/header_hierarchy/metadata_hierarchy.py
@@ -0,0 +1,146 @@
+import logging
+import re
+from collections.abc import Generator
+from contextlib import contextmanager
+from functools import cached_property
+from io import BytesIO
+from logging import Logger
+from pathlib import Path, PurePath
+from typing import Optional, Union
+
+from docling_core.types.doc import BoundingBox, DocItemLabel, ListItem, TextItem
+from docling_ibm_models.reading_order.reading_order_rb import (
+    PageElement as ReadingOrderPageElement,
+)
+from docling_parse.pdf_parser import PdfTableOfContents
+
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import DocumentStream, PageElement, TextElement
+from docling.datamodel.document import ConversionResult
+from docling.models.header_hierarchy.types.hierarchical_header import HierarchicalHeader
+
+# Obtain the logger through the logging hierarchy. A directly instantiated
+# ``Logger`` is not registered with the logging manager and has no parent, so
+# handlers and levels configured by the application would not apply to it.
+logger = logging.getLogger(__name__)
+
+
+class HeaderNotFoundException(Exception):
+    """Signals that a table-of-contents heading was not matched in the document."""
+
+    def __init__(self, heading: str):
+        message = f"Following heading was not found in the document: {heading}"
+        super().__init__(message)
+
+
+class ImplausibleHeadingStructureException(Exception):
+    """Signals that a same-level heading has no parent to attach to."""
+
+    def __init__(self) -> None:
+        message = (
+            "Hierarchy demands equal level heading, but no common parent was found!"
+        )
+        super().__init__(message)
+
+
+class PdfBackendIncompatible(Exception):
+    """Signals that the document backend is not a ``DoclingParseV4DocumentBackend``."""
+
+    def __init__(self, backend) -> None:
+        # The message embeds the repr of the backend's class, e.g. "<class 'int'>".
+        message = f"The selected backend is '{type(backend)}' instead of 'DoclingParseV4DocumentBackend'."
+        super().__init__(message)
+
+
+class InvalidSourceTypeException(Exception):
+    """Plain ``Exception`` subclass without extra behaviour.
+
+    NOTE(review): not raised anywhere in the code visible here; presumably it
+    signals an unsupported input source type - confirm against its callers.
+    """
+
+
+class HierarchyBuilderMetadata:
+    """Build a header hierarchy from the table of contents exposed by the backend.
+
+    Every TOC entry is matched against the text elements of the converted
+    document, comparing only ASCII letters and digits. A matched element is
+    relabelled as a section header and inserted into a ``HierarchicalHeader``
+    tree that mirrors the nesting of the TOC.
+    """
+
+    # Characters ignored when comparing a TOC title with document text.
+    _NON_ALNUM = re.compile(r"[^A-Za-z0-9]")
+
+    def __init__(
+        self,
+        conv_res: ConversionResult,
+        sorted_elements: list[ReadingOrderPageElement],
+        raise_on_error: bool = False,
+    ):
+        """Collect the TOC and the lookup tables needed by ``infer``.
+
+        Args:
+            conv_res: Conversion result; its backend supplies the TOC and its
+                assembled elements are the match candidates.
+            sorted_elements: Reading-order elements; each ``ref.cref`` is looked
+                up in ``cid_to_page_element``.
+            raise_on_error: When True, ``infer`` raises for a TOC title that is
+                not found instead of logging a warning and skipping it.
+
+        Raises:
+            PdfBackendIncompatible: If the backend of ``conv_res`` is not a
+                ``DoclingParseV4DocumentBackend``.
+        """
+        backend = conv_res.input._backend
+        if not isinstance(backend, DoclingParseV4DocumentBackend):
+            raise PdfBackendIncompatible(backend)
+        self.toc_meta: Optional[PdfTableOfContents] = backend.get_table_of_contents()
+        self.conv_res: ConversionResult = conv_res
+        self.all_elements: list[PageElement] = conv_res.assembled.elements
+        # Keys use the "#/<page_no>/<cluster id>" form that is looked up via
+        # ``ro_element.ref.cref`` when matching.
+        self.all_cids: list[str] = [
+            f"#/{element.page_no}/{element.cluster.id}" for element in self.all_elements
+        ]
+        self.sorted_ro_elements: list[ReadingOrderPageElement] = sorted_elements
+        self.raise_on_error: bool = raise_on_error
+        self.cid_to_page_element: dict[str, PageElement] = dict(
+            zip(self.all_cids, self.all_elements)
+        )
+
+    def _iterate_toc(
+        self, toc_element: Optional[PdfTableOfContents] = None, level: int = 0
+    ) -> Generator[tuple[int, str], None, None]:
+        """Yield ``(level, title)`` for every TOC entry, depth first.
+
+        Starts at ``self.toc_meta`` when no element is given. The entry whose
+        text is ``"<root>"`` is not yielded; its children are reported one
+        level deeper than it. Nothing is yielded when there is no TOC.
+        """
+        if toc_element is None:
+            toc_element = self.toc_meta
+        if toc_element:
+            if toc_element.text != "<root>":
+                yield level, toc_element.text
+            for child in toc_element.children:
+                yield from self._iterate_toc(child, level + 1)
+
+    @classmethod
+    def _normalize(cls, text: str) -> str:
+        """Return ``text`` reduced to its ASCII letters and digits."""
+        return cls._NON_ALNUM.sub("", text)
+
+    def _find_header(
+        self, title: str, start: int
+    ) -> Optional[tuple[int, PageElement, str, str]]:
+        """Find the first text element at or after ``start`` that matches ``title``.
+
+        Returns:
+            ``(index, element, original text, cref)`` of the match, or ``None``.
+        """
+        wanted = self._normalize(title)
+        if not wanted:
+            # A title without any ASCII letter or digit would "match" every
+            # element that is equally empty after normalisation.
+            return None
+        candidates = enumerate(self.sorted_ro_elements[start:], start=start)
+        for index, ro_element in candidates:
+            cref = ro_element.ref.cref
+            element = self.cid_to_page_element[cref]
+            if not isinstance(element, TextElement):
+                continue
+            text = "".join(cell.orig for cell in element.cluster.cells)
+            if self._normalize(text) == wanted:
+                return index, element, text, cref
+        return None
+
+    @staticmethod
+    def _select_parent(current: HierarchicalHeader, level: int) -> HierarchicalHeader:
+        """Pick the node under which a header of TOC ``level`` is attached.
+
+        Raises:
+            ImplausibleHeadingStructureException: If a sibling of ``current`` is
+                requested but ``current`` has no parent.
+        """
+        if current.level_toc is None or level > current.level_toc:
+            # Deeper than the previous header (or first header): nest below it.
+            return current
+        if level == current.level_toc:
+            if current.parent is None:
+                raise ImplausibleHeadingStructureException()
+            return current.parent
+        # Shallower: climb until an ancestor with a smaller level (or the
+        # parentless root) is reached.
+        candidate = current
+        while candidate.parent is not None and level <= candidate.level_toc:
+            candidate = candidate.parent
+        return candidate
+
+    def infer(self) -> HierarchicalHeader:
+        """Match the TOC entries to document elements and build the tree.
+
+        The search for each title starts right after the previously matched
+        header, so repeated titles resolve to successive occurrences. When
+        nothing is found there, the whole document is searched so that entries
+        appearing out of order are still matched.
+
+        Side effect: every matched element gets the label
+        ``DocItemLabel.SECTION_HEADER``.
+
+        Returns:
+            The root ``HierarchicalHeader``; it has no children when no TOC
+            entry could be matched.
+
+        Raises:
+            HeaderNotFoundException: If a title is not found and
+                ``raise_on_error`` is True.
+            ImplausibleHeadingStructureException: See ``_select_parent``.
+        """
+        root = HierarchicalHeader()
+        current = root
+        # Index from which the next title is searched first. It must live
+        # outside the loop, otherwise every search restarts at the beginning.
+        search_start = 0
+
+        # NOTE: font information from PdfTextCell is not used here; matching
+        # relies on the text of the ordered elements only.
+        # TODO: prefer an overlap check with the TOC 'to' pointer if available.
+        for level, title in self._iterate_toc():
+            found = self._find_header(title, search_start)
+            if found is None and search_start > 0:
+                found = self._find_header(title, 0)
+            if found is None:
+                if self.raise_on_error:
+                    raise HeaderNotFoundException(title)
+                logger.warning(HeaderNotFoundException(title))
+                continue
+
+            index, this_element, orig_text, ref = found
+            # Never move the start position backwards after a fallback match.
+            search_start = max(search_start, index + 1)
+
+            if this_element.label != DocItemLabel.SECTION_HEADER:
+                this_element.label = DocItemLabel.SECTION_HEADER
+
+            new_parent = self._select_parent(current, level)
+            new_obj = HierarchicalHeader(
+                text=orig_text,
+                parent=new_parent,
+                level_toc=level,
+                doc_ref=ref,
+            )
+            new_parent.children.append(new_obj)
+            current = new_obj
+
+        return root
diff --git a/docling/models/header_hierarchy/parsers.py b/docling/models/header_hierarchy/parsers.py
@@ -0,0 +1,118 @@
+import re
+
+
+class InvalidLetterException(Exception):
+    """Signals that a value is not a single letter usable as a numbering marker."""
+
+    def __init__(self, letter: str):
+        message = f"Invalid letter: {letter}"
+        super().__init__(message)
+
+
+def infer_header_level_numerical(header_text: str) -> list[int]:
+    """Parse the numeric numbering at the start of a header.
+
+    Numbers may be joined by a single dot, whitespace character or minus sign.
+
+    Examples:
+        "1.2.3 Title" -> [1, 2, 3]
+        "1-2 Heading" -> [1, 2]
+        "2 Heading" -> [2]
+        "Heading" -> []
+
+    Args:
+        header_text: Header text; surrounding whitespace is ignored.
+
+    Returns:
+        One integer per numbering level, or an empty list when the header does
+        not start with a number or a number cannot be converted.
+    """
+    text = header_text.strip()
+    # Prefer a multi-level numbering; otherwise accept a lone leading number.
+    numbering = re.match(r"^(?:\d+[.\s-])+\d+", text) or re.match(r"^\d+", text)
+    if numbering is None:
+        return []
+    # Split on the same separators the pattern accepts. Leaving out the minus
+    # sign here made "1-2" fail the int conversion and return [].
+    tokens = re.split(r"[.\s-]", numbering.group(0))
+    try:
+        return [int(token) for token in tokens if token]
+    except ValueError:
+        return []
+
+
+def letter_to_number(letter: str) -> int:
+    """Convert a single letter (A-Z or a-z) to its alphabet position (A/a=1, B/b=2, ...).
+
+    Args:
+        letter: The letter; surrounding whitespace is ignored.
+
+    Returns:
+        The 1-based position of the letter in the alphabet.
+
+    Raises:
+        InvalidLetterException: If the stripped value is not exactly one ASCII
+            letter.
+    """
+    candidate = letter.strip()
+    # ``isalpha`` alone also accepts non-ASCII letters, which produced numbers
+    # far beyond 26 or even a TypeError when lower-casing yields two code points.
+    is_ascii_letter = (
+        len(candidate) == 1 and candidate.isascii() and candidate.isalpha()
+    )
+    if not is_ascii_letter:
+        raise InvalidLetterException(candidate)
+    return ord(candidate.lower()) - ord("a") + 1
+
+
+def infer_header_level_letter(header_text: str) -> list[int]:
+    """Return the numbering level of a header that starts with a letter marker.
+
+    A marker is one ASCII letter followed by at least one dot, closing
+    parenthesis, whitespace character or minus sign, e.g. "A. ", "b) " or
+    "C - Heading". Any single leading letter followed by such a separator
+    qualifies, so "A Study" yields [1] as well.
+
+    Returns:
+        A one-element list with the letter's alphabet position (A/a=1), or an
+        empty list when the header has no such marker.
+    """
+    marker = re.match(r"^([A-Za-z])(?:[.)\s-]+)", header_text.strip())
+    if marker is None:
+        return []
+    try:
+        position = letter_to_number(marker.group(1))
+    except InvalidLetterException:
+        return []
+    return [position]
+
+
+# Roman numeral conversion helper
+def roman_to_int(roman: str) -> int:
+    """Convert a Roman numeral to an integer, ignoring letter case.
+
+    The numeral is read left to right; at each position a two-letter
+    subtractive symbol (such as "CM" or "IX") takes precedence over a single
+    letter. The numeral is not checked for being well formed, so "IIII"
+    yields 4. The empty string yields 0.
+
+    Raises:
+        KeyError: If a character is not one of I, V, X, L, C, D, M.
+    """
+    numeral = roman.upper()
+    symbol_values = {
+        "M": 1000,
+        "CM": 900,
+        "D": 500,
+        "CD": 400,
+        "C": 100,
+        "XC": 90,
+        "L": 50,
+        "XL": 40,
+        "X": 10,
+        "IX": 9,
+        "V": 5,
+        "IV": 4,
+        "I": 1,
+    }
+    total = 0
+    position = 0
+    while position < len(numeral):
+        pair = numeral[position : position + 2]
+        # Check 2-letter symbols first (like 'CM', 'IX', etc.)
+        if len(pair) == 2 and pair in symbol_values:
+            total += symbol_values[pair]
+            position += 2
+            continue
+        total += symbol_values[numeral[position]]
+        position += 1
+    return total
+
+
+def infer_header_level_roman(header_text: str) -> list[int]:
+    """
+    Detects Roman numeral headers (at beginning of the string)
+    and returns list of integer numbering levels.
+
+    The header must start with a Roman numeral (any letter case). Further
+    levels, each introduced by a single dot, whitespace character or minus
+    sign, may be Roman numerals or decimal numbers. The numbering has to be
+    followed by a separator or the end of the text.
+
+    Examples:
+        "II. Methods" -> [2]
+        "IV-2 Results" -> [4, 2]
+        "XIII Introduction" -> [13]
+        "XI.2.3 Subsection" -> [11, 2, 3]
+        "Introduction" -> []
+
+    Returns:
+        One integer per numbering level, or an empty list when the header does
+        not start with a Roman numeral. If a token cannot be converted, the
+        levels collected up to that token are returned.
+    """
+    text = header_text.strip()
+
+    # Leading Roman numeral, then any number of "<separator><roman or digits>"
+    # levels. The lookahead keeps words that merely start with Roman letters
+    # ("Introduction") from matching. Without the digit alternative the
+    # numbering stopped at the first non-Roman level.
+    match = re.match(
+        r"^[IVXLCDM]+(?:[.\s-](?:[IVXLCDM]+|\d+))*(?=[.\s-]|$)",
+        text,
+        re.IGNORECASE,
+    )
+    if match is None:
+        return []
+
+    # Split into tokens by dot, dash, space
+    tokens = [tok for tok in re.split(r"[.\s-]", match.group(0)) if tok]
+
+    groups: list[int] = []
+    try:
+        for tok in tokens:
+            if re.fullmatch(r"[IVXLCDM]+", tok, flags=re.IGNORECASE):
+                groups.append(roman_to_int(tok))
+            elif tok.isdigit():
+                groups.append(int(tok))
+    except KeyError:
+        # Case-insensitive matching can let through a character that does not
+        # upper-case to a Roman letter; keep the levels parsed so far.
+        pass
+    return groups