-
Notifications
You must be signed in to change notification settings - Fork 168
Description
Bug 💥
I get a "RecursionError: maximum recursion depth exceeded" error while following the Get Started notebook: https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Get_Started.ipynb
Desktop (please complete the following information, if any other than the one in the install requirements):
- OS: macOS
Additional context 🧬
Please find the code below.
import deepdoctection as dd
from pathlib import Path
from matplotlib import pyplot as plt
from IPython.core.display import HTML
# Simple PDF extraction example using DeepDoctection
# Make sure you have installed: pip install transformers python-doctr deepdoctection
def extract_pdf_text(pdf_path):
    """Analyze a PDF with the built-in deepdoctection analyzer and return the first page's text.

    Args:
        pdf_path: Path of the PDF file passed to ``analyzer.analyze``.

    Returns:
        The ``text`` attribute of the first page yielded by the analyzer,
        or ``None`` if the analyzer yields no pages.
    """
    analyzer = dd.get_dd_analyzer()  # instantiate the built-in analyzer
    df = analyzer.analyze(path=pdf_path)
    df.reset_state()  # Trigger some initialization
    # Only the first page is needed; the default avoids StopIteration when
    # the analyzer yields nothing.
    page = next(iter(df), None)
    if page is None:
        return None
    return page.text


if __name__ == "__main__":
    print(extract_pdf_text("sample.pdf"))
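Here are the error logs (the traceback shows `get_doctr_requirement` calling itself at `file_utils.py` line 667 until the recursion limit is reached):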
[0622 22:36.56 @file_utils.py:30] INF PyTorch version 2.7.1 available.
[0622 22:36.56 @file_utils.py:93] INF Disabling Tensorflow because USE_TORCH is set
[0622 22:37.02 @dd.py:129] INF Config:
{'DEVICE': device(type='mps'),
'LANGUAGE': None,
'LAYOUT_LINK': {'CHILD_CATEGORIES': [<LayoutType.CAPTION>],
'PARENTAL_CATEGORIES': [<LayoutType.FIGURE>, <LayoutType.TABLE>]},
'LAYOUT_NMS_PAIRS': {'COMBINATIONS': [[<LayoutType.TABLE>, <LayoutType.TITLE>],
[<LayoutType.TABLE>, <LayoutType.TEXT>],
[<LayoutType.TABLE>, <LayoutType.KEY_VALUE_AREA>],
[<LayoutType.TABLE>, <LayoutType.LIST_ITEM>],
[<LayoutType.TABLE>, <LayoutType.LIST>],
[<LayoutType.TABLE>, <LayoutType.FIGURE>],
[<LayoutType.TITLE>, <LayoutType.TEXT>],
[<LayoutType.TEXT>, <LayoutType.KEY_VALUE_AREA>],
[<LayoutType.TEXT>, <LayoutType.LIST_ITEM>],
[<LayoutType.TEXT>, <LayoutType.CAPTION>],
[<LayoutType.KEY_VALUE_AREA>, <LayoutType.LIST_ITEM>],
[<LayoutType.FIGURE>, <LayoutType.CAPTION>]],
'PRIORITY': [<LayoutType.TABLE>, <LayoutType.TABLE>, <LayoutType.TABLE>,
<LayoutType.TABLE>, <LayoutType.TABLE>, <LayoutType.TABLE>,
<LayoutType.TEXT>, <LayoutType.TEXT>, None, <LayoutType.CAPTION>,
<LayoutType.KEY_VALUE_AREA>, <LayoutType.FIGURE>],
'THRESHOLDS': [0.001, 0.01, 0.01, 0.001, 0.01, 0.01, 0.05, 0.01, 0.01, 0.01,
0.01, 0.001]},
'LIB': 'PT',
'OCR': {'CONFIG': {'TESSERACT': 'dd/conf_tesseract.yaml'},
'USE_DOCTR': True,
'USE_TESSERACT': False,
'USE_TEXTRACT': False,
'WEIGHTS': {'DOCTR_RECOGNITION': {'PT': 'doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt',
'TF': 'doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip'},
'DOCTR_WORD': {'PT': 'doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt',
'TF': 'doctr/db_resnet50/tf/db_resnet50-adcafc63.zip'}}},
'PDF_MINER': {'X_TOLERANCE': 3, 'Y_TOLERANCE': 3},
'PT': {'CELL': {'FILTER': None,
'PAD': {'BOTTOM': 60, 'LEFT': 60, 'RIGHT': 60, 'TOP': 60},
'PADDING': False,
'WEIGHTS': 'cell/d2_model_1849999_cell_inf_only.pt',
'WEIGHTS_TS': 'cell/d2_model_1849999_cell_inf_only.ts'},
'ENFORCE_WEIGHTS': {'CELL': False, 'ITEM': False, 'LAYOUT': False},
'ITEM': {'FILTER': ['table'],
'PAD': {'BOTTOM': 60, 'LEFT': 60, 'RIGHT': 60, 'TOP': 60},
'PADDING': False,
'WEIGHTS': 'deepdoctection/tatr_tab_struct_v2/pytorch_model.bin',
'WEIGHTS_TS': 'item/d2_model_1639999_item_inf_only.ts'},
'LAYOUT': {'FILTER': None,
'PAD': {'BOTTOM': 0, 'LEFT': 0, 'RIGHT': 0, 'TOP': 0},
'PADDING': False,
'WEIGHTS': 'Aryn/deformable-detr-DocLayNet/model.safetensors',
'WEIGHTS_TS': 'layout/d2_model_0829999_layout_inf_only.ts'}},
'SEGMENTATION': {'ASSIGNMENT_RULE': 'ioa',
'CELL_NAMES': [<CellType.HEADER>, <CellType.BODY>, <LayoutType.CELL>],
'FULL_TABLE_TILING': True,
'ITEM_NAMES': [<LayoutType.ROW>, <LayoutType.COLUMN>],
'PUBTABLES_CELL_NAMES': [<LayoutType.CELL>],
'PUBTABLES_ITEM_HEADER_CELL_NAMES': [<CellType.COLUMN_HEADER>,
<CellType.ROW_HEADER>,
<CellType.PROJECTED_ROW_HEADER>],
'PUBTABLES_ITEM_HEADER_THRESHOLDS': [0.6, 0.0001],
'PUBTABLES_ITEM_NAMES': [<LayoutType.ROW>, <LayoutType.COLUMN>],
'PUBTABLES_SPANNING_CELL_NAMES': [<CellType.SPANNING>],
'PUBTABLES_SUB_ITEM_NAMES': [<CellType.ROW_NUMBER>, <CellType.COLUMN_NUMBER>],
'REMOVE_IOU_THRESHOLD_COLS': 0.2,
'REMOVE_IOU_THRESHOLD_ROWS': 0.2,
'STRETCH_RULE': 'equal',
'SUB_ITEM_NAMES': [<CellType.ROW_NUMBER>, <CellType.COLUMN_NUMBER>],
'TABLE_NAME': <LayoutType.TABLE>,
'THRESHOLD_COLS': 0.4,
'THRESHOLD_ROWS': 0.4},
'TEXT_CONTAINER': <LayoutType.WORD>,
'TEXT_ORDERING': {'BROKEN_LINE_TOLERANCE': 0.003,
'FLOATING_TEXT_BLOCK_CATEGORIES': (<LayoutType.TEXT>, <LayoutType.TITLE>,
<LayoutType.LIST>,
<LayoutType.KEY_VALUE_AREA>),
'HEIGHT_TOLERANCE': 2.0,
'INCLUDE_RESIDUAL_TEXT_CONTAINER': True,
'PARAGRAPH_BREAK': 0.035,
'STARTING_POINT_TOLERANCE': 0.005,
'TEXT_BLOCK_CATEGORIES': (<LayoutType.TEXT>, <LayoutType.TITLE>,
<LayoutType.LIST_ITEM>, <LayoutType.LIST>,
<LayoutType.CAPTION>, <LayoutType.PAGE_HEADER>,
<LayoutType.PAGE_FOOTER>, <LayoutType.PAGE_NUMBER>,
<LayoutType.MARK>, <LayoutType.KEY_VALUE_AREA>,
<LayoutType.FIGURE>, <CellType.SPANNING>,
<LayoutType.CELL>)},
'TF': {'CELL': {'FILTER': None, 'WEIGHTS': 'cell/model-1800000_inf_only.data-00000-of-00001'},
'ITEM': {'FILTER': None, 'WEIGHTS': 'item/model-1620000_inf_only.data-00000-of-00001'},
'LAYOUT': {'FILTER': None, 'WEIGHTS': 'layout/model-800000_inf_only.data-00000-of-00001'}},
'USE_LAYOUT': True,
'USE_LAYOUT_LINK': False,
'USE_LAYOUT_NMS': True,
'USE_LINE_MATCHER': False,
'USE_OCR': True,
'USE_PDF_MINER': False,
'USE_ROTATOR': False,
'USE_TABLE_REFINEMENT': False,
'USE_TABLE_SEGMENTATION': True,
'WORD_MATCHING': {'MAX_PARENT_ONLY': True,
'PARENTAL_CATEGORIES': (<LayoutType.TEXT>, <LayoutType.TITLE>,
<LayoutType.LIST_ITEM>, <LayoutType.LIST>,
<LayoutType.CAPTION>, <LayoutType.PAGE_HEADER>,
<LayoutType.PAGE_FOOTER>, <LayoutType.PAGE_NUMBER>,
<LayoutType.MARK>, <LayoutType.KEY_VALUE_AREA>,
<LayoutType.FIGURE>, <CellType.SPANNING>,
<LayoutType.CELL>),
'RULE': 'ioa',
'THRESHOLD': 0.3}}
[0622 22:37.03 @fs.py:138] INF File db_resnet50-ac60cadc.pt exists! Skip download.
Traceback (most recent call last):
File "/Users/john/Code/Python/DeepDoc/.projectenv/lib/python3.12/site-packages/deepdoctection/utils/file_utils.py", line 667, in get_doctr_requirement
return get_doctr_requirement()
^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/john/Code/Python/DeepDoc/.projectenv/lib/python3.12/site-packages/deepdoctection/utils/file_utils.py", line 667, in get_doctr_requirement
return get_doctr_requirement()
^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/john/Code/Python/DeepDoc/.projectenv/lib/python3.12/site-packages/deepdoctection/utils/file_utils.py", line 667, in get_doctr_requirement
return get_doctr_requirement()
^^^^^^^^^^^^^^^^^^^^^^^
[Previous line repeated 995 more times]
File "/Users/john/Code/Python/DeepDoc/.projectenv/lib/python3.12/site-packages/deepdoctection/utils/file_utils.py", line 666, in get_doctr_requirement
if not get_poppler_version():
^^^^^^^^^^^^^^^^^^^^^
File "/Users/john/Code/Python/DeepDoc/.projectenv/lib/python3.12/site-packages/deepdoctection/utils/file_utils.py", line 428, in get_poppler_version
if pdf_to_ppm_available():
^^^^^^^^^^^^^^^^^^^^^^
RecursionError: maximum recursion depth exceeded