diff --git a/mindee/image/__init__.py b/mindee/image/__init__.py index f562ff76..e69de29b 100644 --- a/mindee/image/__init__.py +++ b/mindee/image/__init__.py @@ -1,3 +0,0 @@ -from mindee.image.image_compressor import compress_image - -__all__ = ["compress_image"] diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py index 6d0df276..3ad6af8a 100644 --- a/mindee/image/extracted_image.py +++ b/mindee/image/extracted_image.py @@ -1,14 +1,12 @@ from __future__ import annotations -import io from pathlib import Path -from typing import Any +from typing import Any, BinaryIO from mindee.dependencies.checkers import PILLOW_AVAILABLE from mindee.dependencies.decorators import requires_pillow from mindee.error.mindee_error import MindeeError -from mindee.input.file_input import FileInput -from mindee.input.local_input_source import LocalInputSource +from mindee.input.bytes_input import BytesInput from mindee.logger import logger if PILLOW_AVAILABLE: @@ -21,78 +19,68 @@ class ExtractedImage: """Generic class for image extraction.""" + buffer: BinaryIO + filename: str _page_id: int """Id of the page the image was extracted from.""" _element_id: int """Id of the element on a given page.""" - filename: str - """Name of the file the image was extracted from.""" def __init__( - self, input_source: LocalInputSource, page_id: int, element_id: int + self, + img_byte_stream: BinaryIO, + filename: str, + page_id: int, + element_id: int, ) -> None: """ Initialize the ExtractedImage with a buffer and an internal file name. - :param input_source: Local source for input. + :param img_byte_stream: The raw image bytes. + :param filename: Name of the file. :param page_id: ID of the page the element was found on. :param element_id: ID of the element in a page. """ - self.buffer = io.BytesIO(input_source.file_object.read()) - self.buffer.name = input_source.filename - self.filename = input_source.filename - if input_source.is_pdf(): - extension = "jpg" - else: - extension = Path(input_source.filename).resolve().suffix + self.buffer = img_byte_stream self.buffer.seek(0) - pg_number = str(page_id).zfill(3) - elem_number = str(element_id).zfill(3) - self.internal_file_name = ( - f"{input_source.filename}_page{pg_number}-{elem_number}.{extension}" - ) + self.filename = filename self._page_id = page_id self._element_id = 0 if element_id is None else element_id @requires_pillow - def save_to_file(self, output_path: Path | str, file_format: str | None = None): + def save_to_file(self, output_path: Path | str): """ Saves the document to a file. :param output_path: Path to save the file to. - :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided. :raises MindeeError: If an invalid path or filename is provided. """ + out_path = Path(output_path) + if not out_path.resolve().is_dir(): + raise MindeeError("Provided path is not a directory.") + out_file_path = out_path / self.filename try: - resolved_path = Path(output_path).resolve() - if not file_format and len(resolved_path.suffix) < 1: - raise ValueError("Invalid file format.") self.buffer.seek(0) image = Image.open(self.buffer) - if file_format: - image.save(resolved_path, format=file_format) - else: - image.save(resolved_path) - logger.info("File saved successfully to '%s'.", resolved_path) - except TypeError as e: - raise MindeeError("Invalid path/filename provided.") from e + image.save(out_file_path) + logger.info("File saved successfully to '%s'.", out_file_path) except Exception as e: print(e) raise MindeeError(f"Could not save file {Path(output_path).name}.") from e - def as_input_source(self) -> FileInput: + def as_input_source(self) -> BytesInput: """ Return the file as a Mindee-compatible BufferInput source. :returns: A BufferInput source. """ self.buffer.seek(0) - return FileInput(self.buffer) + return BytesInput(self.buffer.read(), self.filename) @property def page_id(self): """ - ID of the page the receipt was found on. + ID of the page the image was found on. :return: A valid page ID. """ diff --git a/mindee/image/extracted_images.py b/mindee/image/extracted_images.py new file mode 100644 index 00000000..cf55e57e --- /dev/null +++ b/mindee/image/extracted_images.py @@ -0,0 +1,12 @@ +from pathlib import Path + +from mindee.image.extracted_image import ExtractedImage + + +class ExtractedImages(list[ExtractedImage]): + """List of extracted images.""" + + def save_all_to_disk(self, output_path: Path | str) -> None: + """Save all extracted images to disk.""" + for image in self: + image.save_to_file(output_path) diff --git a/mindee/image/image_extractor.py b/mindee/image/image_extractor.py index 0a33168a..681b46f9 100644 --- a/mindee/image/image_extractor.py +++ b/mindee/image/image_extractor.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +from pathlib import Path from typing import Any, BinaryIO from mindee.dependencies import requires_pypdfium2 @@ -10,7 +11,6 @@ from mindee.geometry.point import Point from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y from mindee.image.extracted_image import ExtractedImage -from mindee.input.bytes_input import BytesInput from mindee.input.local_input_source import LocalInputSource if PYPDFIUM2_AVAILABLE: @@ -29,7 +29,7 @@ @requires_pillow @requires_pypdfium2 -def attach_image_as_new_file( # type: ignore +def _attach_image_as_new_file( # type: ignore input_buffer: BinaryIO, ) -> pdfium.PdfDocument: """ @@ -66,7 +66,7 @@ def extract_image_from_polygon( width: float, height: float, file_format: str, -) -> bytes: +) -> BinaryIO: """ Crops the image from the given polygon. @@ -87,11 +87,11 @@ def extract_image_from_polygon( int(min_max_y.max * height), ) ) - return save_image_to_buffer(cropped_image, file_format) + return _save_image_to_buffer(cropped_image, file_format) @requires_pillow -def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes: +def _save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO: """ Saves an image as a buffer. @@ -102,7 +102,7 @@ def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes: buffer = io.BytesIO() image.save(buffer, format=file_format) buffer.seek(0) - return buffer.read() + return buffer @requires_pillow @@ -145,7 +145,8 @@ def extract_multiple_images_from_source( :param polygons: List of coordinates to pull the elements from. :return: List of byte arrays representing the extracted elements. """ - page = load_pdf_doc(input_source).get_page(page_id) + stem = Path(input_source.filename).stem + page = _load_pdf_doc(input_source).get_page(page_id) page_content = page.render().to_pil() width, height = page.get_size() @@ -159,20 +160,17 @@ def extract_multiple_images_from_source( ) extracted_elements.append( ExtractedImage( - BytesInput( - image_data, - f"{input_source.filename}_page{page_id + 1}-{element_id}.{file_extension}", - ), + image_data, + f"{stem}_page-{(page_id + 1):03d}-item-{(element_id + 1):03d}.{file_extension}", page_id, element_id, ) ) - return extracted_elements @requires_pypdfium2 -def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore +def _load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore """ Loads a PDF document from a local input source. @@ -183,4 +181,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i input_file.file_object.seek(0) return pdfium.PdfDocument(input_file.file_object.read()) - return attach_image_as_new_file(input_file.file_object) + return _attach_image_as_new_file(input_file.file_object) diff --git a/mindee/input/local_input_source.py b/mindee/input/local_input_source.py index 1a8ecf15..0fe78842 100644 --- a/mindee/input/local_input_source.py +++ b/mindee/input/local_input_source.py @@ -10,7 +10,7 @@ from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError -from mindee.image import compress_image +from mindee.image.image_compressor import compress_image from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions from mindee.logger import logger from mindee.pdf.pdf_compressor import compress_pdf diff --git a/mindee/pdf/extracted_pdf.py b/mindee/pdf/extracted_pdf.py index 21a0137f..7d1ec8e9 100644 --- a/mindee/pdf/extracted_pdf.py +++ b/mindee/pdf/extracted_pdf.py @@ -18,18 +18,22 @@ class ExtractedPDF: """An extracted sub-Pdf.""" - pdf_bytes: BinaryIO + buffer: BinaryIO filename: str + _page_indexes: tuple[int, int] - def __init__(self, pdf_bytes: BinaryIO, filename: str): - self.pdf_bytes = pdf_bytes + def __init__( + self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int] + ): + self.buffer = pdf_byte_stream self.filename = filename + self._page_indexes = page_indexes @requires_pypdfium2 def get_page_count(self) -> int: """Get the number of pages in the PDF file.""" try: - pdf = pdfium.PdfDocument(self.pdf_bytes) + pdf = pdfium.PdfDocument(self.buffer) return len(pdf) except Exception as e: raise MindeeError( @@ -40,21 +44,28 @@ def save_to_file(self, output_path: Path | str): """ Writes the contents of the current PDF object to a file. - :param output_path: Path of the destination file. If - not extension is provided, pdf will be appended by default. + :param output_path: Path of the destination file. + If no extension is provided, '.pdf' will be appended by default. """ out_path = Path(output_path) - if out_path.resolve().is_dir(): - raise MindeeError("Provided path is not a file.") - if not output_path or not out_path.parent.exists(): - raise MindeeError("Invalid save path provided {}.") - if out_path.suffix.lower() != "pdf": - out_path = out_path.parent / (out_path.stem + "." + "pdf") - self.pdf_bytes.seek(0) - with open(out_path, "wb") as out_file: - out_file.write(self.pdf_bytes.read()) + if not out_path.resolve().is_dir(): + raise MindeeError("Provided path is not a directory.") + out_file_path = out_path / self.filename + + try: + self.buffer.seek(0) + with open(out_file_path, "wb") as out_file: + out_file.write(self.buffer.read()) + except Exception as e: + print(e) + raise MindeeError(f"Could not save file {out_file_path}.") from e def as_input_source(self) -> BytesInput: """Returns the current PDF object as a usable BytesInput source.""" - self.pdf_bytes.seek(0) - return BytesInput(self.pdf_bytes.read(), self.filename) + self.buffer.seek(0) + return BytesInput(self.buffer.read(), self.filename) + + @property + def page_indexes(self) -> tuple[int, int]: + """This PDF was extracted from this page range of the original PDF.""" + return self._page_indexes diff --git a/mindee/pdf/extracted_pdfs.py b/mindee/pdf/extracted_pdfs.py new file mode 100644 index 00000000..2701b627 --- /dev/null +++ b/mindee/pdf/extracted_pdfs.py @@ -0,0 +1,13 @@ +from pathlib import Path + +from mindee.pdf.extracted_pdf import ExtractedPDF + + +class ExtractedPDFs(list[ExtractedPDF]): + """List of extracted PDFs.""" + + def save_all_to_disk(self, output_path: Path | str) -> None: + """Save all extracted images to disk.""" + + for image in self: + image.save_to_file(output_path) diff --git a/mindee/pdf/pdf_extractor.py b/mindee/pdf/pdf_extractor.py index 3e081aa3..7ea91897 100644 --- a/mindee/pdf/pdf_extractor.py +++ b/mindee/pdf/pdf_extractor.py @@ -68,7 +68,7 @@ def extract_sub_documents( """ Extract the sub-documents from the main pdf, based on the given list of page indexes. - :param page_indexes: List of list of numbers, representing page indexes. + :param page_indexes: 2D list of numbers, representing page indexes. :return: A list of created PDFS. """ extracted_pdfs: list[ExtractedPDF] = [] @@ -80,10 +80,12 @@ def extract_sub_documents( for page_index in page_index_elem: if page_index > self.get_page_count(): raise MindeeError(f"Index {page_index} is out of range.") - formatted_max_index = f"{page_index_elem[len(page_index_elem) - 1] + 1:03d}" - field_filename = f"{stem}_{(page_index_elem[0] + 1):03d}-{formatted_max_index}{extension}" + first_page = page_index_elem[0] + last_page = page_index_elem[len(page_index_elem) - 1] extracted_pdf = ExtractedPDF( - self.cut_pages(page_index_elem), field_filename + self.cut_pages(page_index_elem), + f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}{extension}", + (first_page, last_page), ) extracted_pdfs.append(extracted_pdf) return extracted_pdfs diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py index 3f69b88c..15c0fe80 100644 --- a/mindee/v2/file_operations/crop.py +++ b/mindee/v2/file_operations/crop.py @@ -1,9 +1,9 @@ from mindee.error import MindeeError from mindee.geometry import Point, Polygon from mindee.image.extracted_image import ExtractedImage +from mindee.image.extracted_images import ExtractedImages from mindee.image.image_extractor import extract_multiple_images_from_source from mindee.input.local_input_source import LocalInputSource -from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.parsing.inference.field import FieldLocation from mindee.v2.product.crop.crop_item import CropItem @@ -25,7 +25,7 @@ def extract_single_crop( def extract_multiple_crops( input_source: LocalInputSource, crops: list[CropItem] -) -> CropFiles: +) -> ExtractedImages: """ Extracts individual receipts from multi-receipts documents. @@ -49,4 +49,4 @@ def extract_multiple_crops( polygon, ) ) - return CropFiles(images) + return ExtractedImages(images) diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py deleted file mode 100644 index 4bb9f341..00000000 --- a/mindee/v2/file_operations/crop_files.py +++ /dev/null @@ -1,20 +0,0 @@ -from pathlib import Path - -from mindee.image.extracted_image import ExtractedImage - - -class CropFiles(list[ExtractedImage]): - """Crop files.""" - - def save_all_to_disk(self, path: Path | str, prefix: str = "crop"): - """ - Save all extracted crops to disk. - - :param path: Path to save the extracted splits to. - :param prefix: Prefix to add to the filename, defaults to 'crop'. - """ - if isinstance(path, str): - path = Path(path) - path.mkdir(parents=True, exist_ok=True) - for idx, split in enumerate(self, start=1): - split.save_to_file(path / f"{prefix}_{idx:03}.jpg") diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py index 686f3929..8259b65f 100644 --- a/mindee/v2/file_operations/split.py +++ b/mindee/v2/file_operations/split.py @@ -1,8 +1,8 @@ from mindee.error import MindeeError from mindee.input.local_input_source import LocalInputSource from mindee.pdf.extracted_pdf import ExtractedPDF +from mindee.pdf.extracted_pdfs import ExtractedPDFs from mindee.pdf.pdf_extractor import PDFExtractor -from mindee.v2.file_operations.split_files import SplitFiles def extract_single_split( @@ -21,7 +21,7 @@ def extract_single_split( def extract_multiple_splits( input_source: LocalInputSource, splits: list[list[int]], -) -> SplitFiles: +) -> ExtractedPDFs: """ Extracts splits as complete PDFs from the document. @@ -35,4 +35,4 @@ def extract_multiple_splits( page_groups.append(list(range(split[0], split[1] + 1))) if len(splits) < 1: raise MindeeError("No indexes provided.") - return SplitFiles(pdf_extractor.extract_sub_documents(page_groups)) + return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups)) diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py deleted file mode 100644 index 8c23057b..00000000 --- a/mindee/v2/file_operations/split_files.py +++ /dev/null @@ -1,20 +0,0 @@ -from pathlib import Path - -from mindee.pdf.extracted_pdf import ExtractedPDF - - -class SplitFiles(list[ExtractedPDF]): - """Split files.""" - - def save_all_to_disk(self, path: str | Path, prefix: str = "split"): - """ - Save all extracted splits to disk. - - :param path: Path to save the extracted splits to. - :param prefix: Prefix to add to the filename, defaults to 'split'. - """ - if isinstance(path, str): - path = Path(path) - path.mkdir(parents=True, exist_ok=True) - for idx, split in enumerate(self, start=1): - split.save_to_file(path / f"{prefix}_{idx:03}.pdf") diff --git a/mindee/v2/product/crop/crop_result.py b/mindee/v2/product/crop/crop_result.py index 47561e90..d103e5b9 100644 --- a/mindee/v2/product/crop/crop_result.py +++ b/mindee/v2/product/crop/crop_result.py @@ -1,7 +1,7 @@ +from mindee.image.extracted_images import ExtractedImages from mindee.input.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict from mindee.v2.file_operations.crop import extract_multiple_crops -from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.product.crop.crop_item import CropItem @@ -20,7 +20,9 @@ def __str__(self) -> str: out_str = f"Crops\n====={crops}" return out_str - def extract_from_input_source(self, input_source: LocalInputSource) -> CropFiles: + def extract_from_input_source( + self, input_source: LocalInputSource + ) -> ExtractedImages: """ Apply all the crops to a file and return a single extracted PDF. diff --git a/mindee/v2/product/split/split_result.py b/mindee/v2/product/split/split_result.py index ab3921bf..dd3ac9da 100644 --- a/mindee/v2/product/split/split_result.py +++ b/mindee/v2/product/split/split_result.py @@ -1,7 +1,7 @@ from mindee.input.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.pdf.extracted_pdfs import ExtractedPDFs from mindee.v2.file_operations.split import extract_multiple_splits -from mindee.v2.file_operations.split_files import SplitFiles from mindee.v2.product.split.split_range import SplitRange @@ -20,7 +20,9 @@ def __str__(self) -> str: out_str = f"Splits\n======{splits}" return out_str - def extract_from_input_source(self, input_source: LocalInputSource) -> SplitFiles: + def extract_from_input_source( + self, input_source: LocalInputSource + ) -> ExtractedPDFs: """ Apply all the crops to a file and return a single extracted PDF. diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index 6b4371ab..5f9a941e 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -6,7 +6,7 @@ import pytest -from mindee.image import compress_image +from mindee.image.image_compressor import compress_image from mindee.input import PathInput from mindee.pdf.pdf_compressor import compress_pdf from mindee.pdf.pdf_utils import extract_text_from_pdf diff --git a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py index 8c4d46c5..1a6be6de 100644 --- a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py @@ -53,13 +53,11 @@ def test_pdf_should_extract_invoices_strict(): ) for i, extracted_pdf in enumerate(extracted_base_pdfs): assert extracted_pdf.filename == extracted_pdfs_strict[i].filename - assert ( - extracted_pdf.pdf_bytes.read() == extracted_pdfs_strict[i].pdf_bytes.read() - ) + assert extracted_pdf.buffer.read() == extracted_pdfs_strict[i].buffer.read() assert len(extracted_pdfs_not_strict) == 2 - assert extracted_pdfs_not_strict[0].filename == "default_sample_001-001.pdf" - assert extracted_pdfs_not_strict[1].filename == "default_sample_002-002.pdf" + assert extracted_pdfs_not_strict[0].filename == "default_sample_pages-001-001.pdf" + assert extracted_pdfs_not_strict[1].filename == "default_sample_pages-002-002.pdf" invoice_0 = client.parse(InvoiceV4, extracted_pdfs_not_strict[0].as_input_source()) test_string_rst_invoice_0 = prepare_invoice_return( diff --git a/tests/v1/extraction/test_pdf_extractor.py b/tests/v1/extraction/test_pdf_extractor.py index f4871ac6..69f303b5 100644 --- a/tests/v1/extraction/test_pdf_extractor.py +++ b/tests/v1/extraction/test_pdf_extractor.py @@ -56,13 +56,13 @@ def test_pdf_should_extract_invoices_no_strict( assert len(extracted_pdfs_no_strict) == 3 assert extracted_pdfs_no_strict[0].get_page_count() == 1 - assert extracted_pdfs_no_strict[0].filename == "invoice_5p_001-001.pdf" + assert extracted_pdfs_no_strict[0].filename == "invoice_5p_pages-001-001.pdf" assert extracted_pdfs_no_strict[1].get_page_count() == 3 - assert extracted_pdfs_no_strict[1].filename == "invoice_5p_002-004.pdf" + assert extracted_pdfs_no_strict[1].filename == "invoice_5p_pages-002-004.pdf" assert extracted_pdfs_no_strict[2].get_page_count() == 1 - assert extracted_pdfs_no_strict[2].filename == "invoice_5p_005-005.pdf" + assert extracted_pdfs_no_strict[2].filename == "invoice_5p_pages-005-005.pdf" @pytest.mark.pillow @@ -79,7 +79,7 @@ def test_pdf_should_extract_invoices_strict( assert len(extracted_pdfs_strict) == 2 assert extracted_pdfs_strict[0].get_page_count() == 1 - assert extracted_pdfs_strict[0].filename == "invoice_5p_001-001.pdf" + assert extracted_pdfs_strict[0].filename == "invoice_5p_pages-001-001.pdf" assert extracted_pdfs_strict[1].get_page_count() == 4 - assert extracted_pdfs_strict[1].filename == "invoice_5p_002-005.pdf" + assert extracted_pdfs_strict[1].filename == "invoice_5p_pages-002-005.pdf" diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py index 22fe809a..0f5d1b22 100644 --- a/tests/v2/file_operations/test_crop_operation.py +++ b/tests/v2/file_operations/test_crop_operation.py @@ -5,7 +5,6 @@ import pytest from mindee.input.path_input import PathInput -from mindee.v2.file_operations.crop import extract_multiple_crops from mindee.v2.product.crop.crop_response import ( CropResponse, ) @@ -14,58 +13,51 @@ Image = pytest.importorskip("PIL.Image") -@pytest.fixture -def crops_single_page_path(): - return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg" - - -@pytest.fixture -def crops_multi_page_path(): - return V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.pdf" - - -@pytest.fixture -def crops_single_page_json_path(): - return V2_PRODUCT_DATA_DIR / "crop" / "crop_single.json" - - -@pytest.fixture -def crops_multi_page_json_path(): - return V2_PRODUCT_DATA_DIR / "crop" / "crop_multiple.json" - - @pytest.mark.pillow @pytest.mark.pypdfium2 -def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path): - input_sample = PathInput(crops_single_page_path) - with open(crops_single_page_json_path, "rb") as f: - response = json.load(f) - doc = CropResponse(response) - extracted_crops = extract_multiple_crops(input_sample, doc.inference.result.crops) - assert len(extracted_crops) == 1 +def test_single_page_crop(): + input_sample = PathInput(V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg") + with open(V2_PRODUCT_DATA_DIR / "crop" / "default_sample.json", "rb") as f: + response = CropResponse(json.load(f)) + extracted_crops = response.inference.result.extract_from_input_source(input_sample) + assert len(extracted_crops) == 2 + + crop0 = extracted_crops[0] + assert crop0.page_id == 0 + assert crop0.element_id == 0 + assert crop0.filename == "default_sample_page-001-item-001.jpg" + assert Image.open(crop0.buffer).size == (1057, 2071) - assert extracted_crops[0].page_id == 0 - assert extracted_crops[0].element_id == 0 - image_buffer_0 = Image.open(extracted_crops[0].buffer) - assert image_buffer_0.size == (2823, 1571) + crop1 = extracted_crops[1] + assert crop1.page_id == 0 + assert crop1.element_id == 1 + assert crop1.filename == "default_sample_page-001-item-002.jpg" + assert Image.open(crop1.buffer).size == (1298, 1869) @pytest.mark.pillow @pytest.mark.pypdfium2 -def test_multi_page_receipt_crop(crops_multi_page_path, crops_multi_page_json_path): - input_sample = PathInput(crops_multi_page_path) - with open(crops_multi_page_json_path, "rb") as f: - response = json.load(f) - doc = CropResponse(response) - extracted_crops = extract_multiple_crops(input_sample, doc.inference.result.crops) - assert len(extracted_crops) == 2 - - assert extracted_crops[0].page_id == 0 - assert extracted_crops[0].element_id == 0 - image_buffer_0 = Image.open(extracted_crops[0].buffer) - assert image_buffer_0.size == (156, 758) - - assert extracted_crops[1].page_id == 0 - assert extracted_crops[1].element_id == 1 - image_buffer_1 = Image.open(extracted_crops[1].buffer) - assert image_buffer_1.size == (187, 690) +def test_multi_page_crop(): + input_sample = PathInput(V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.pdf") + with open(V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.json", "rb") as f: + response = CropResponse(json.load(f)) + extracted_crops = response.inference.result.extract_from_input_source(input_sample) + assert len(extracted_crops) == 5 + + crop0 = extracted_crops[0] + assert crop0.page_id == 0 + assert crop0.element_id == 0 + assert crop0.filename == "multipage_sample_page-001-item-001.jpg" + assert Image.open(crop0.buffer).size == (200, 553) + + crop1 = extracted_crops[1] + assert crop1.page_id == 0 + assert crop1.element_id == 1 + assert crop1.filename == "multipage_sample_page-001-item-002.jpg" + assert Image.open(crop1.buffer).size == (203, 333) + + crop4 = extracted_crops[4] + assert crop4.page_id == 1 + assert crop4.element_id == 1 + assert crop4.filename == "multipage_sample_page-002-item-002.jpg" + assert Image.open(crop4.buffer).size == (197, 520) diff --git a/tests/v2/file_operations/test_crop_operation_integration.py b/tests/v2/file_operations/test_crop_operation_integration.py index ea44ac65..d2e047c5 100644 --- a/tests/v2/file_operations/test_crop_operation_integration.py +++ b/tests/v2/file_operations/test_crop_operation_integration.py @@ -15,16 +15,17 @@ from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files -@pytest.fixture -def crop_sample(): - return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg" - - def check_findoc_return(findoc_response: ExtractionResponse): assert len(findoc_response.inference.model.id) > 0 assert findoc_response.inference.result.fields.get("total_amount").value > 0 +output_files = [ + "default_sample_page-001-item-001.jpg", + "default_sample_page-001-item-002.jpg", +] + + @pytest.mark.pillow @pytest.mark.pypdfium2 @pytest.mark.integration @@ -38,30 +39,30 @@ def test_image_should_extract_crops(): ) assert len(response.inference.result.crops) == 2 - extracted_images = extract_multiple_crops( + extracted_crops = extract_multiple_crops( crop_input, response.inference.result.crops ) - assert len(extracted_images) == 2 - assert extracted_images[0].filename == "default_sample.jpg_page1-0.jpg" - assert extracted_images[1].filename == "default_sample.jpg_page1-1.jpg" + assert len(extracted_crops) == 2 + assert extracted_crops[0].filename == output_files[0] + assert extracted_crops[1].filename == output_files[1] invoice_0 = client.enqueue_and_get_result( ExtractionResponse, - extracted_images[0].as_input_source(), + extracted_crops[0].as_input_source(), ExtractionParameters( getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False ), ) check_findoc_return(invoice_0) - extracted_images.save_all_to_disk(OUTPUT_DIR) - crop1size = os.path.getsize(OUTPUT_DIR / "crop_001.jpg") - crop2size = os.path.getsize(OUTPUT_DIR / "crop_002.jpg") - assert 180000 <= crop1size <= 199685 - assert 190000 <= crop2size <= 199433 + extracted_crops.save_all_to_disk(OUTPUT_DIR) + crop0_size = os.path.getsize(OUTPUT_DIR / output_files[0]) + crop1_size = os.path.getsize(OUTPUT_DIR / output_files[1]) + assert 180000 <= crop0_size <= 199685 + assert 190000 <= crop1_size <= 199433 @pytest.fixture(scope="module", autouse=True) def cleanup(): yield - cleanup_output_files(["crop_001.jpg", "crop_002.jpg"]) + cleanup_output_files(output_files) diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index e7cf3ddd..31a3047f 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -9,13 +9,6 @@ from tests.utils import V2_PRODUCT_DATA_DIR -@pytest.fixture -def splits_default(): - return ( - V2_PRODUCT_DATA_DIR / "extraction" / "financial_document" / "default_sample.jpg" - ) - - @pytest.fixture def splits_5p(): return V2_PRODUCT_DATA_DIR / "split" / "invoice_5p.pdf" @@ -32,26 +25,41 @@ def splits_multi_page_json_path(): @pytest.mark.pypdfium2 -def test_single_page_split(splits_default, splits_single_page_json_path): - input_sample = PathInput(splits_default) - with open(splits_single_page_json_path, "rb") as f: - response = json.load(f) - doc = SplitResponse(response) - extracted_splits = doc.inference.result.extract_from_input_source(input_sample) - assert len(extracted_splits) == 1 +def test_default_split(): + input_sample = PathInput(V2_PRODUCT_DATA_DIR / "split" / "default_sample.pdf") + with open(V2_PRODUCT_DATA_DIR / "split" / "default_sample.json", "rb") as f: + response = SplitResponse(json.load(f)) + extracted_splits = response.inference.result.extract_from_input_source(input_sample) + assert len(extracted_splits) == 2 assert extracted_splits[0].get_page_count() == 1 + assert extracted_splits[0].filename == "default_sample_pages-001-001.pdf" + assert extracted_splits[1].get_page_count() == 1 + assert extracted_splits[1].filename == "default_sample_pages-002-002.pdf" @pytest.mark.pypdfium2 def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): input_sample = PathInput(splits_5p) with open(splits_multi_page_json_path, "rb") as f: - response = json.load(f) - doc = SplitResponse(response) - extracted_splits = doc.inference.result.extract_from_input_source(input_sample) + response = SplitResponse(json.load(f)) + extracted_splits = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_splits) == 3 assert extracted_splits[0].get_page_count() == 1 + assert extracted_splits[0].filename == "invoice_5p_pages-001-001.pdf" assert extracted_splits[1].get_page_count() == 3 + assert extracted_splits[1].filename == "invoice_5p_pages-002-004.pdf" assert extracted_splits[2].get_page_count() == 1 + assert extracted_splits[2].filename == "invoice_5p_pages-005-005.pdf" + + +@pytest.mark.pypdfium2 +def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path): + input_sample = PathInput(splits_5p) + with open(splits_multi_page_json_path, "rb") as f: + response = SplitResponse(json.load(f)) + split = response.inference.result.splits[1] + extracted_split = split.extract_from_input_source(input_sample) + + assert extracted_split.get_page_count() == 3 diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py index 1f604ea8..b5719bc7 100644 --- a/tests/v2/file_operations/test_split_operation_integration.py +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -13,19 +13,21 @@ from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files -@pytest.fixture -def invoice_splitter_5p_path(): - return V2_PRODUCT_DATA_DIR / "split" / "invoice_5p.pdf" - - def check_findoc_return(findoc_response: ExtractionResponse): assert len(findoc_response.inference.model.id) > 0 assert findoc_response.inference.result.fields.get("total_amount").value > 0 +output_files = [ + "default_sample_pages-001-001.pdf", + "default_sample_pages-002-002.pdf", +] + + @pytest.mark.pypdfium2 @pytest.mark.integration def test_pdf_should_extract_splits(): + client = Client() split_input = PathInput(V2_PRODUCT_DATA_DIR / "split" / "default_sample.pdf") response = client.enqueue_and_get_result( @@ -38,25 +40,25 @@ def test_pdf_should_extract_splits(): ) assert response.inference.file.page_count == 2 - extracted_pdfs = response.inference.result.extract_from_input_source(split_input) + extracted_splits = response.inference.result.extract_from_input_source(split_input) - assert len(extracted_pdfs) == 2 - assert extracted_pdfs[0].filename == "default_sample_001-001.pdf" - assert extracted_pdfs[1].filename == "default_sample_002-002.pdf" + assert len(extracted_splits) == 2 + assert extracted_splits[0].filename == output_files[0] + assert extracted_splits[1].filename == output_files[1] invoice_0 = client.enqueue_and_get_result( ExtractionResponse, - extracted_pdfs[0].as_input_source(), + extracted_splits[0].as_input_source(), ExtractionParameters( getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False ), ) check_findoc_return(invoice_0) - extracted_pdfs.save_all_to_disk(OUTPUT_DIR) - for i in range(len(extracted_pdfs)): - local_input = PathInput(OUTPUT_DIR / f"split_{i + 1:03d}.pdf") + extracted_splits.save_all_to_disk(OUTPUT_DIR) + for i in range(len(extracted_splits)): + local_input = PathInput(OUTPUT_DIR / output_files[i]) try: - assert local_input.page_count == extracted_pdfs[i].get_page_count() + assert local_input.page_count == extracted_splits[i].get_page_count() finally: local_input.close() split_input.close() @@ -65,4 +67,4 @@ def test_pdf_should_extract_splits(): @pytest.fixture(scope="module", autouse=True) def cleanup(): yield - cleanup_output_files(["split_001.pdf", "split_002.pdf"]) + cleanup_output_files(output_files)