Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docling/backend/docling_parse_v4_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument, PdfTableOfContents
from PIL import Image
from pypdfium2 import PdfPage

Expand Down Expand Up @@ -225,6 +225,9 @@ def page_count(self) -> int:

return len_2

def get_table_of_contents(self) -> Optional[PdfTableOfContents]:
    """Return the table of contents reported by the parsed document.

    This is a thin delegate to ``self.dp_doc.get_table_of_contents()``;
    whatever that call returns is passed through unchanged.
    """
    toc = self.dp_doc.get_table_of_contents()
    return toc

def load_page(
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend:
Expand Down
4 changes: 4 additions & 0 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,10 @@ def load_page(self, page_no: int) -> PyPdfiumPageBackend:
def is_valid(self) -> bool:
    """Report whether the document has at least one page."""
    total_pages = self.page_count()
    return total_pages > 0

def get_table_of_contents(self) -> list[dict]:
    """Return the document outline produced by ``self._pdoc.get_toc()`` as a list.

    NOTE(review): the ``list[dict]`` annotation is unverified from here;
    confirm the item type that pypdfium2's ``get_toc()`` actually yields.
    """
    with pypdfium2_lock:
        # Materialise while the lock is held: if get_toc() is lazy, the
        # underlying pdfium calls happen during iteration, not at call time.
        outline = self._pdoc.get_toc()
        return list(outline)

def unload(self):
super().unload()
with pypdfium2_lock:
Expand Down
Empty file.
67 changes: 67 additions & 0 deletions docling/models/header_hierarchy/hierarchy_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import logging

from pydantic import BaseModel

from docling.datamodel.document import ConversionResult
from docling.models.header_hierarchy.metadata_hierarchy import HierarchyBuilderMetadata
from docling.models.header_hierarchy.style_based_hierarchy import StyleBasedHierarchy
from docling.models.header_hierarchy.types.hierarchical_header import HierarchicalHeader
from docling.models.readingorder_model import ReadingOrderPageElement

_log = logging.getLogger(__name__)


class PDFHeaderHierarchyOptions(BaseModel):
    # Options read by HierarchyBuilder when deriving the header hierarchy.

    # When true, HierarchyBuilder first tries HierarchyBuilderMetadata
    # (hierarchy taken from the PDF's table of contents).
    use_toc_hierarchy: bool = True
    # reset_additional_headers_to_toc: bool = True

    # The next four values are forwarded unchanged to StyleBasedHierarchy,
    # which HierarchyBuilder only runs when the TOC step produced no children
    # and at least one of the two ``infer_hierarchy_from_*`` flags is set.
    # NOTE(review): their exact semantics live in StyleBasedHierarchy, which
    # is not visible here - confirm before documenting further.
    remove_duplicate_headers: bool = True
    infer_hierarchy_from_style: bool = True
    infer_hierarchy_from_numbering: bool = True
    min_prop_numbered: float = 0.3

    # When true, a failure of the TOC step is re-raised by HierarchyBuilder;
    # otherwise it is logged and processing continues. Also forwarded to
    # both hierarchy strategies.
    raise_on_error: bool = False


class HierarchyBuilder:
    """Derive a header hierarchy for a converted document.

    The TOC-based strategy (``HierarchyBuilderMetadata``) is tried first when
    enabled; if it yields no children, the style/numbering based strategy
    (``StyleBasedHierarchy``) is used as a fallback when enabled.
    """

    def __init__(self, options: PDFHeaderHierarchyOptions):
        self.options = options

    def __call__(
        self,
        conv_res: ConversionResult,
        sorted_elements: list[ReadingOrderPageElement],
    ) -> HierarchicalHeader:
        """Build and return the root of the header hierarchy.

        Args:
            conv_res: Conversion result handed to both strategies.
            sorted_elements: Page elements in reading order.

        Returns:
            The root ``HierarchicalHeader``; it has no children when neither
            strategy produced a hierarchy.

        Raises:
            Exception: Whatever the TOC strategy raised, but only when
                ``options.raise_on_error`` is set.
        """
        opts = self.options
        hierarchy = HierarchicalHeader()

        if opts.use_toc_hierarchy:
            try:
                hierarchy = HierarchyBuilderMetadata(
                    conv_res=conv_res,
                    sorted_elements=sorted_elements,
                    raise_on_error=opts.raise_on_error,
                ).infer()
            except Exception as e:
                if opts.raise_on_error:
                    raise e
                # Best effort: keep the empty root and let the fallback run.
                _log.error(
                    f"HierarchyBuilderMetadata infer failed with exception {type(e)}: '{e}'"
                )

        fallback_enabled = (
            opts.infer_hierarchy_from_numbering or opts.infer_hierarchy_from_style
        )
        if not fallback_enabled or hierarchy.children:
            return hierarchy

        style_based = StyleBasedHierarchy(
            conv_res=conv_res,
            sorted_elements=sorted_elements,
            raise_on_error=opts.raise_on_error,
            remove_duplicate_headers=opts.remove_duplicate_headers,
            infer_hierarchy_from_style=opts.infer_hierarchy_from_style,
            infer_hierarchy_from_numbering=opts.infer_hierarchy_from_numbering,
            min_prop_numbered=opts.min_prop_numbered,
        )
        return style_based.process()
146 changes: 146 additions & 0 deletions docling/models/header_hierarchy/metadata_hierarchy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import logging
import re
from collections.abc import Generator
from contextlib import contextmanager
from functools import cached_property
from io import BytesIO
from logging import Logger
from pathlib import Path, PurePath
from typing import Optional, Union

from docling_core.types.doc import BoundingBox, DocItemLabel, ListItem, TextItem
from docling_ibm_models.reading_order.reading_order_rb import (
PageElement as ReadingOrderPageElement,
)
from docling_parse.pdf_parser import PdfTableOfContents

from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PageElement, TextElement
from docling.datamodel.document import ConversionResult
from docling.models.header_hierarchy.types.hierarchical_header import HierarchicalHeader

# Obtain the logger through the logging hierarchy so its records honour the
# application's configured handlers and levels. A directly constructed
# ``Logger(...)`` has no parent and bypasses that configuration.
logger = logging.getLogger(__name__)


class HeaderNotFoundException(Exception):
    """Signals that a heading could not be located in the document."""

    def __init__(self, heading: str):
        message = f"Following heading was not found in the document: {heading}"
        super().__init__(message)


class ImplausibleHeadingStructureException(Exception):
    """Signals a sibling-level heading for which no common parent exists."""

    def __init__(self) -> None:
        message = (
            "Hierarchy demands equal level heading, but no common parent was found!"
        )
        super().__init__(message)


class PdfBackendIncompatible(Exception):
    """Signals that the document backend is not the one this module requires."""

    def __init__(self, backend) -> None:
        backend_type = type(backend)
        super().__init__(
            f"The selected backend is '{backend_type}' instead of 'DoclingParseV4DocumentBackend'."
        )


class InvalidSourceTypeException(Exception):
    """Error type for an invalid source type.

    NOTE(review): nothing in the visible code raises this - confirm it is
    still needed.
    """


class HierarchyBuilderMetadata:
    """Build a header hierarchy from the PDF's embedded table of contents.

    Every TOC entry is matched against the text elements of the converted
    document, visited in reading order. A matched element is relabelled as a
    section header and inserted into a tree of ``HierarchicalHeader`` nodes
    according to its TOC nesting level.
    """

    # Characters ignored when comparing a TOC title with element text.
    # NOTE(review): titles without any ASCII letters/digits normalise to ""
    # and therefore compare equal to any element that also normalises to "".
    _NON_ALNUM = re.compile(r"[^A-Za-z0-9]")

    def __init__(
        self,
        conv_res: ConversionResult,
        sorted_elements: list[ReadingOrderPageElement],
        raise_on_error: bool = False,
    ):
        """Collect the TOC and the element lookup tables.

        Args:
            conv_res: Conversion result; its input backend supplies the TOC
                and ``conv_res.assembled.elements`` supplies the elements.
            sorted_elements: Reading-order elements whose ``ref.cref`` keys
                have the form ``#/<page_no>/<cluster id>``.
            raise_on_error: Raise instead of warn when a TOC heading cannot
                be found in the document.

        Raises:
            PdfBackendIncompatible: If the input backend is not a
                ``DoclingParseV4DocumentBackend``.
        """
        backend = conv_res.input._backend
        # Only the docling-parse v4 backend is accepted here.
        if not isinstance(backend, DoclingParseV4DocumentBackend):
            raise PdfBackendIncompatible(backend)
        self.toc_meta: Optional[PdfTableOfContents] = backend.get_table_of_contents()
        self.conv_res: ConversionResult = conv_res
        self.all_elements: list[PageElement] = conv_res.assembled.elements
        self.all_cids: list[str] = [
            f"#/{element.page_no}/{element.cluster.id}" for element in self.all_elements
        ]
        self.sorted_ro_elements: list[ReadingOrderPageElement] = sorted_elements
        self.raise_on_error: bool = raise_on_error
        self.cid_to_page_element: dict[str, PageElement] = dict(
            zip(self.all_cids, self.all_elements)
        )

    def _iterate_toc(
        self, toc_element: Optional[PdfTableOfContents] = None, level: int = 0
    ) -> Generator[tuple[int, str], None, None]:
        """Yield ``(level, title)`` for the TOC entries, depth first.

        Starts at ``self.toc_meta`` when ``toc_element`` is None. An entry
        whose text is ``"<root>"`` is not yielded, but still counts as one
        nesting level for its children.
        """
        if toc_element is None:
            toc_element = self.toc_meta
        if toc_element:
            if toc_element.text != "<root>":
                yield level, toc_element.text
            for child in toc_element.children:
                yield from self._iterate_toc(child, level + 1)

    def _find_heading(
        self, title: str, start: int
    ) -> Optional[tuple[int, PageElement, str, str]]:
        """Locate the text element whose text matches ``title``.

        The search begins at reading-order index ``start`` and wraps around
        to the earlier elements, so headings that follow the previous match
        are preferred without losing matches that precede it.

        Returns:
            ``(index, element, original_text, cref)`` for the first match, or
            None when no text element matches.
        """
        wanted = self._NON_ALNUM.sub("", title)
        total = len(self.sorted_ro_elements)
        for offset in range(total):
            index = (start + offset) % total
            ro_element = self.sorted_ro_elements[index]
            element = self.cid_to_page_element[ro_element.ref.cref]
            # TODO: better to look for an overlap with the TOC entry's 'to'
            # pointer if possible, instead of comparing text.
            if not isinstance(element, TextElement):
                continue
            orig_text = "".join(c.orig for c in element.cluster.cells)
            if self._NON_ALNUM.sub("", orig_text) == wanted:
                return index, element, orig_text, ro_element.ref.cref
        return None

    def infer(self) -> HierarchicalHeader:
        """Build the header tree described by the table of contents.

        Returns:
            The root ``HierarchicalHeader``; it has no children when the TOC
            is missing or none of its titles were found.

        Raises:
            HeaderNotFoundException: A TOC title has no matching text element
                and ``raise_on_error`` is set (otherwise a warning is logged
                and the entry is skipped).
            ImplausibleHeadingStructureException: A same-level heading follows
                a node that has no parent.
        """
        root = HierarchicalHeader()
        current = root
        # Reading-order position just behind the previously matched heading.
        # It must persist across TOC entries so that repeated titles resolve
        # to successive occurrences instead of always the first one.
        search_start = 0

        # NOTE: font information lives in PdfTextCell, while the ordered
        # elements used here only provide text and references.
        for level, title in self._iterate_toc():
            found = self._find_heading(title, search_start)
            if found is None:
                error = HeaderNotFoundException(title)
                if self.raise_on_error:
                    raise error
                logger.warning(error)
                continue
            index, this_element, orig_text, ref = found
            search_start = index + 1

            # A TOC hit is authoritative: force the section-header label.
            if this_element.label != DocItemLabel.SECTION_HEADER:
                this_element.label = DocItemLabel.SECTION_HEADER

            if current.level_toc is None or level > current.level_toc:
                # Deeper than the current node (or first entry): nest below it.
                new_parent = current
            elif level == current.level_toc:
                # Sibling of the current node: share its parent.
                if current.parent is None:
                    raise ImplausibleHeadingStructureException()
                new_parent = current.parent
            else:
                # Shallower: climb until an ancestor with a smaller level (or
                # the root) is reached.
                new_parent = current
                while new_parent.parent is not None and (level <= new_parent.level_toc):
                    new_parent = new_parent.parent

            new_obj = HierarchicalHeader(
                text=orig_text,
                parent=new_parent,
                level_toc=level,
                doc_ref=ref,
            )
            new_parent.children.append(new_obj)
            current = new_obj

        return root
118 changes: 118 additions & 0 deletions docling/models/header_hierarchy/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import re


class InvalidLetterException(Exception):
    """Signals that a value is not a usable single-letter marker."""

    def __init__(self, letter: str):
        message = f"Invalid letter: {letter}"
        super().__init__(message)


def infer_header_level_numerical(header_text: str) -> list[int]:
    """Extract the leading numeric section numbering of a header.

    Numbers at the start of the (stripped) text may be separated by a dot,
    a whitespace character or a minus sign.

    Examples:
        "1.2.3 Title" -> [1, 2, 3]
        "1-2 Heading" -> [1, 2]
        "2 Heading"   -> [2]
        "Heading"     -> []

    Args:
        header_text: Raw header text.

    Returns:
        The numbering as a list of ints, or an empty list when the header
        does not start with a number.
    """
    text = header_text.strip()

    # Two or more numbers joined by single separator characters.
    multi = re.match(r"^(?:\d+[.\s-])+\d+", text)
    if multi:
        try:
            # Split on the same separators the match accepts; splitting on a
            # smaller set left tokens such as "1-2" that int() rejects.
            return [int(tok) for tok in re.split(r"[.\s-]", multi.group(0)) if tok]
        except ValueError:
            return []

    # Single number at the start (e.g. "2 Heading").
    single = re.match(r"^\d+", text)
    if single:
        return [int(single.group(0))]

    # No numbering found.
    return []


def letter_to_number(letter: str) -> int:
    """Convert a single letter (A-Z or a-z) to its position in the alphabet.

    Surrounding whitespace is ignored; A/a=1, B/b=2, ..., Z/z=26.

    Args:
        letter: The letter to convert.

    Returns:
        The 1-based alphabet position.

    Raises:
        InvalidLetterException: If the stripped value is not exactly one
            ASCII letter.
    """
    letter = letter.strip()
    # isalpha() alone also accepts non-ASCII letters, which would yield
    # out-of-range numbers (or break ord() when lower() expands to two
    # code points), so require ASCII explicitly.
    if len(letter) != 1 or not (letter.isascii() and letter.isalpha()):
        raise InvalidLetterException(letter)
    return ord(letter.lower()) - ord("a") + 1


def infer_header_level_letter(header_text: str) -> list[int]:
    """Detect a single-letter marker at the start of a header.

    The stripped text must begin with one letter (A-Z or a-z) that is
    followed by at least one of ".", ")", whitespace or "-", as in "A. ",
    "b) " or "C - Heading".

    Returns:
        A one-element list with the letter's numeric equivalent, or an empty
        list when no marker is found.
    """
    marker = re.match(r"^([A-Za-z])(?:[.)\s-]+)", header_text.strip())
    if marker is None:
        return []

    try:
        position = letter_to_number(marker.group(1))
    except InvalidLetterException:
        return []
    return [position]


# Roman numeral conversion helper
def roman_to_int(roman: str) -> int:
    """Convert a Roman numeral to its integer value.

    The input is upper-cased and consumed left to right, taking a two-letter
    subtractive symbol (such as "CM" or "IX") whenever one is present at the
    current position. The numeral is not checked for well-formedness, so
    e.g. "IIII" evaluates to 4.

    Raises:
        KeyError: If a character is not a Roman numeral symbol.
    """
    symbol_values = {
        "M": 1000,
        "CM": 900,
        "D": 500,
        "CD": 400,
        "C": 100,
        "XC": 90,
        "L": 50,
        "XL": 40,
        "X": 10,
        "IX": 9,
        "V": 5,
        "IV": 4,
        "I": 1,
    }
    numeral = roman.upper()
    position, total = 0, 0
    while position < len(numeral):
        pair = numeral[position : position + 2]
        # Check 2-letter symbols first (like 'CM', 'IX', etc.)
        if len(pair) == 2 and pair in symbol_values:
            total += symbol_values[pair]
            position += 2
        else:
            total += symbol_values[numeral[position]]
            position += 1
    return total


def infer_header_level_roman(header_text: str) -> list[int]:
    """
    Detects Roman numeral headers (at beginning of the string)
    and returns list of integer numbering levels.

    The numbering must start with a Roman numeral. Further levels may be
    Roman numerals or plain digits, each joined by a single dot, whitespace
    character or minus sign. The numbering has to be followed by such a
    separator or by the end of the text. Matching is case-insensitive.

    Examples:
        "II. Methods" -> [2]
        "IV-2 Results" -> [4, 2]
        "XIII Introduction" -> [13]
        "XI.2.3 Subsection" -> [11, 2, 3]

    NOTE(review): ordinary words made only of the letters I, V, X, L, C, D, M
    (for example "I" or "Mix") are also read as numerals.
    """
    text = header_text.strip()

    # A leading Roman numeral, then any number of "<separator><roman|digits>"
    # levels. Digit levels were previously not part of the pattern, so
    # "IV-2" stopped after "IV".
    match = re.match(
        r"^[IVXLCDM]+(?:[.\s-](?:[IVXLCDM]+|\d+))*(?:[.\s-]|$)",
        text,
        re.IGNORECASE,
    )
    if not match:
        return []

    # Split into tokens by dot, dash, space
    tokens = [t for t in re.split(r"[.\s-]", match.group(0)) if t]

    groups: list[int] = []
    try:
        for tok in tokens:
            if re.fullmatch(r"[IVXLCDM]+", tok, flags=re.IGNORECASE):
                groups.append(roman_to_int(tok))
            elif tok.isdigit():
                groups.append(int(tok))
    except KeyError:
        # Case-insensitive matching can let through characters that
        # roman_to_int cannot convert; keep the levels parsed so far.
        pass
    return groups
Loading