Skip to content

Проблема: TabbyPdfError (Exception in thread "main" java.lang.OutOfMemoryError: Java heap space) при парсинге документов #489

@FatherOctber

Description

@FatherOctber

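При парсинге PDF-документа на 1801 страницу (параметр `pdf_with_text_layer`: `auto_tabby`) Java-процесс TabbyPDF завершился с ошибкой `java.lang.OutOfMemoryError: Java heap space`, и запрос `/upload` вернул 500. Полный лог: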

`2024-08-27 16:25:07,893 - /dedoc_root/dedoc/dedoc_manager.py - INFO - Get file tmpggtlpvfp.pdf with parameters {'document_type': 'diploma', 'structure_type': 'tree', 'return_format': 'html', 'with_attachments': 'false', 'need_content_analysis': 'false', 'recursion_deep_attachments': '10', 'return_base64': 'false', 'need_pdf_table_analysis': 'true', 'table_type': '', 'orient_analysis_cells': 'false', 'orient_cell_angle': '90', 'pdf_with_text_layer': 'auto_tabby', 'language': 'rus', 'pages': ':', 'is_one_column_document': 'true', 'document_orientation': 'auto', 'need_header_footer_analysis': 'false', 'need_binarization': 'false', 'delimiter': None, 'encoding': None, 'html_fields': '', 'handle_invisible_table': 'true', 'attachments_dir': '/tmp/tmpkupfsv1p'}
2024-08-27 16:25:07,893 - /dedoc_root/dedoc/dedoc_manager.py - INFO - Start handle /tmp/tmpkupfsv1p/tmpggtlpvfp.pdf
2024-08-27 16:25:10,058 - /dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py - INFO - Reading PDF pages from 1 to 8
2024-08-27 16:26:01,569 - /dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py - INFO - Assume document 1724765107_455.pdf has a correct textual layer
2024-08-27 16:26:03,394 - /dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py - INFO - Reading PDF pages from 1 to 1801
2024-08-27 16:32:51,051 - /dedoc_root/dedoc/api/dedoc_api.py - ERROR - Exception TabbyPdfError(Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:172)
at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:538)
at java.base/java.lang.StringBuffer.append(StringBuffer.java:317)
at java.base/java.io.StringWriter.write(StringWriter.java:106)
at org.json.JSONObject.write(JSONObject.java:2565)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.toString(JSONObject.java:2354)
at org.json.JSONObject.toString(JSONObject.java:2319)
at writers.JsonDocumentWriter.write(JsonDocumentWriter.java:51)
at DedocTableExtractor.printJSON(DedocTableExtractor.java:185)
at DedocTableExtractor.extract(DedocTableExtractor.java:131)
at DedocTableExtractor.run(DedocTableExtractor.java:89)
at DedocTableExtractor.main(DedocTableExtractor.java:58)
)
Traceback (most recent call last):
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 296, in __run
result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, check=True)
File "/usr/lib/python3.9/subprocess.py", line 528, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['java', '-jar', '/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar', '-i', '/tmp/tmpks3qpt9b/1724765107_455.pdf', '-sp', '1', '-ep', '1801']' returned non-zero exit status 1.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/usr/local/lib/python3.9/dist-packages/starlette/middleware/exceptions.py", line 68, in call
await self.app(scope, receive, sender)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 718, in call
await route.handle(scope, receive, send)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 276, in handle
await self.app(scope, receive, send)
File "/usr/local/lib/python3.9/dist-packages/starlette/routing.py", line 66, in app
response = await func(request)
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 299, in app
raise e
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 294, in app
raw_response = await run_endpoint_function(
File "/usr/local/lib/python3.9/dist-packages/fastapi/routing.py", line 191, in run_endpoint_function
return await dependant.call(**values)
File "/dedoc_root/dedoc/api/dedoc_api.py", line 81, in upload
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
File "/dedoc_root/dedoc/dedoc_manager.py", line 81, in parse
raise e
File "/dedoc_root/dedoc/dedoc_manager.py", line 74, in parse
return self.__parse_no_error_handling(file_path=file_path, parameters=parameters)
File "/dedoc_root/dedoc/dedoc_manager.py", line 107, in __parse_no_error_handling
converted_file_path, unstructured_document = self.__read_with_mime_auto_detection(
File "/dedoc_root/dedoc/dedoc_manager.py", line 153, in __read_with_mime_auto_detection
converted_file_path, document = self.__parse_file(file_path=file_path, file_name=file_name, parameters=parameters, mime=mime, extension=extension)
File "/dedoc_root/dedoc/dedoc_manager.py", line 177, in __parse_file
unstructured_document = self.reader.read(file_path=converted_file_path, parameters=parameters, mime=mime, extension=extension)
File "/dedoc_root/dedoc/readers/reader_composition.py", line 39, in read
unstructured_document = reader.read(file_path=file_path, parameters=parameters)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py", line 59, in read
result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct,
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py", line 99, in __handle_correct_text_layer
result = reader.read(file_path=path, parameters=parameters)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 62, in read
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 105, in __extract
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 308, in __process_pdf
output = self.__run(path=path, start_page=start_page, end_page=end_page)
File "/dedoc_root/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py", line 303, in __run
raise TabbyPdfError(e.stderr.decode(encoding))
dedoc.common.exceptions.tabby_pdf_error.TabbyPdfError: TabbyPdfError(Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:172)
at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:538)
at java.base/java.lang.StringBuffer.append(StringBuffer.java:317)
at java.base/java.io.StringWriter.write(StringWriter.java:106)
at org.json.JSONObject.write(JSONObject.java:2565)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.writeValue(JSONObject.java:2484)
at org.json.JSONArray.write(JSONArray.java:1540)
at org.json.JSONObject.writeValue(JSONObject.java:2486)
at org.json.JSONObject.write(JSONObject.java:2571)
at org.json.JSONObject.toString(JSONObject.java:2354)
at org.json.JSONObject.toString(JSONObject.java:2319)
at writers.JsonDocumentWriter.write(JsonDocumentWriter.java:51)
at DedocTableExtractor.printJSON(DedocTableExtractor.java:185)
at DedocTableExtractor.extract(DedocTableExtractor.java:131)
at DedocTableExtractor.run(DedocTableExtractor.java:89)
at DedocTableExtractor.main(DedocTableExtractor.java:58)
)

INFO: 127.0.0.1:53028 - "POST /upload HTTP/1.1" 500 Internal Server Error`

Версия dedoc: v2.2.6

Metadata

Labels

No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions