Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions mindee/image/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from mindee.image.image_compressor import compress_image

__all__ = ["compress_image"]
58 changes: 23 additions & 35 deletions mindee/image/extracted_image.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from __future__ import annotations

import io
from pathlib import Path
from typing import Any
from typing import Any, BinaryIO

from mindee.dependencies.checkers import PILLOW_AVAILABLE
from mindee.dependencies.decorators import requires_pillow
from mindee.error.mindee_error import MindeeError
from mindee.input.file_input import FileInput
from mindee.input.local_input_source import LocalInputSource
from mindee.input.bytes_input import BytesInput
from mindee.logger import logger

if PILLOW_AVAILABLE:
Expand All @@ -21,78 +19,68 @@
class ExtractedImage:
"""Generic class for image extraction."""

buffer: BinaryIO
filename: str
_page_id: int
"""Id of the page the image was extracted from."""
_element_id: int
"""Id of the element on a given page."""
filename: str
"""Name of the file the image was extracted from."""

def __init__(
self, input_source: LocalInputSource, page_id: int, element_id: int
self,
img_byte_stream: BinaryIO,
filename: str,
page_id: int,
element_id: int,
) -> None:
"""
Initialize the ExtractedImage with a buffer and an internal file name.

:param input_source: Local source for input.
:param img_byte_stream: The raw image bytes.
:param filename: Name of the file.
:param page_id: ID of the page the element was found on.
:param element_id: ID of the element in a page.
"""
self.buffer = io.BytesIO(input_source.file_object.read())
self.buffer.name = input_source.filename
self.filename = input_source.filename
if input_source.is_pdf():
extension = "jpg"
else:
extension = Path(input_source.filename).resolve().suffix
self.buffer = img_byte_stream
self.buffer.seek(0)
pg_number = str(page_id).zfill(3)
elem_number = str(element_id).zfill(3)
self.internal_file_name = (
f"{input_source.filename}_page{pg_number}-{elem_number}.{extension}"
)
self.filename = filename
self._page_id = page_id
self._element_id = 0 if element_id is None else element_id

@requires_pillow
def save_to_file(self, output_path: Path | str, file_format: str | None = None):
def save_to_file(self, output_path: Path | str):
"""
Saves the document to a file.

:param output_path: Path to save the file to.
:param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
:raises MindeeError: If an invalid path or filename is provided.
"""
out_path = Path(output_path)
if not out_path.resolve().is_dir():
raise MindeeError("Provided path is not a directory.")
out_file_path = out_path / self.filename
try:
resolved_path = Path(output_path).resolve()
if not file_format and len(resolved_path.suffix) < 1:
raise ValueError("Invalid file format.")
self.buffer.seek(0)
image = Image.open(self.buffer)
if file_format:
image.save(resolved_path, format=file_format)
else:
image.save(resolved_path)
logger.info("File saved successfully to '%s'.", resolved_path)
except TypeError as e:
raise MindeeError("Invalid path/filename provided.") from e
image.save(out_file_path)
logger.info("File saved successfully to '%s'.", out_file_path)
except Exception as e:
print(e)
raise MindeeError(f"Could not save file {Path(output_path).name}.") from e

def as_input_source(self) -> FileInput:
def as_input_source(self) -> BytesInput:
"""
Return the file as a Mindee-compatible BytesInput source.

:returns: A BytesInput source.
"""
self.buffer.seek(0)
return FileInput(self.buffer)
return BytesInput(self.buffer.read(), self.filename)

@property
def page_id(self):
"""
ID of the page the receipt was found on.
ID of the page the image was found on.

:return: A valid page ID.
"""
Expand Down
12 changes: 12 additions & 0 deletions mindee/image/extracted_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from pathlib import Path

from mindee.image.extracted_image import ExtractedImage


class ExtractedImages(list[ExtractedImage]):
    """A list of ExtractedImage objects that can be written out in one call."""

    def save_all_to_disk(self, output_path: Path | str) -> None:
        """
        Call ``save_to_file`` on every image held in this list, in list order.

        :param output_path: Destination handed unchanged to each image's
            ``save_to_file``.
        """
        for extracted_image in self:
            extracted_image.save_to_file(output_path)
26 changes: 12 additions & 14 deletions mindee/image/image_extractor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import io
from pathlib import Path
from typing import Any, BinaryIO

from mindee.dependencies import requires_pypdfium2
Expand All @@ -10,7 +11,6 @@
from mindee.geometry.point import Point
from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
from mindee.image.extracted_image import ExtractedImage
from mindee.input.bytes_input import BytesInput
from mindee.input.local_input_source import LocalInputSource

if PYPDFIUM2_AVAILABLE:
Expand All @@ -29,7 +29,7 @@

@requires_pillow
@requires_pypdfium2
def attach_image_as_new_file( # type: ignore
def _attach_image_as_new_file( # type: ignore
input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
"""
Expand Down Expand Up @@ -66,7 +66,7 @@ def extract_image_from_polygon(
width: float,
height: float,
file_format: str,
) -> bytes:
) -> BinaryIO:
"""
Crops the image from the given polygon.

Expand All @@ -87,11 +87,11 @@ def extract_image_from_polygon(
int(min_max_y.max * height),
)
)
return save_image_to_buffer(cropped_image, file_format)
return _save_image_to_buffer(cropped_image, file_format)


@requires_pillow
def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
def _save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO:
"""
Saves an image as a buffer.

Expand All @@ -102,7 +102,7 @@ def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
buffer = io.BytesIO()
image.save(buffer, format=file_format)
buffer.seek(0)
return buffer.read()
return buffer


@requires_pillow
Expand Down Expand Up @@ -145,7 +145,8 @@ def extract_multiple_images_from_source(
:param polygons: List of coordinates to pull the elements from.
:return: List of byte arrays representing the extracted elements.
"""
page = load_pdf_doc(input_source).get_page(page_id)
stem = Path(input_source.filename).stem
page = _load_pdf_doc(input_source).get_page(page_id)
page_content = page.render().to_pil()
width, height = page.get_size()

Expand All @@ -159,20 +160,17 @@ def extract_multiple_images_from_source(
)
extracted_elements.append(
ExtractedImage(
BytesInput(
image_data,
f"{input_source.filename}_page{page_id + 1}-{element_id}.{file_extension}",
),
image_data,
f"{stem}_page-{(page_id + 1):03d}-item-{(element_id + 1):03d}.{file_extension}",
page_id,
element_id,
)
)

return extracted_elements


@requires_pypdfium2
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore
def _load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore
"""
Loads a PDF document from a local input source.

Expand All @@ -183,4 +181,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
input_file.file_object.seek(0)
return pdfium.PdfDocument(input_file.file_object.read())

return attach_image_as_new_file(input_file.file_object)
return _attach_image_as_new_file(input_file.file_object)
2 changes: 1 addition & 1 deletion mindee/input/local_input_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE
from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
from mindee.image import compress_image
from mindee.image.image_compressor import compress_image
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
from mindee.logger import logger
from mindee.pdf.pdf_compressor import compress_pdf
Expand Down
45 changes: 28 additions & 17 deletions mindee/pdf/extracted_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,22 @@
class ExtractedPDF:
"""An extracted sub-Pdf."""

pdf_bytes: BinaryIO
buffer: BinaryIO
filename: str
_page_indexes: tuple[int, int]

def __init__(self, pdf_bytes: BinaryIO, filename: str):
self.pdf_bytes = pdf_bytes
def __init__(
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int]
):
self.buffer = pdf_byte_stream
self.filename = filename
self._page_indexes = page_indexes

@requires_pypdfium2
def get_page_count(self) -> int:
"""Get the number of pages in the PDF file."""
try:
pdf = pdfium.PdfDocument(self.pdf_bytes)
pdf = pdfium.PdfDocument(self.buffer)
return len(pdf)
except Exception as e:
raise MindeeError(
Expand All @@ -40,21 +44,28 @@ def save_to_file(self, output_path: Path | str):
"""
Writes the contents of the current PDF object to a file.

:param output_path: Path of the destination file. If
not extension is provided, pdf will be appended by default.
:param output_path: Path of the destination file.
If no extension is provided, '.pdf' will be appended by default.
"""
out_path = Path(output_path)
if out_path.resolve().is_dir():
raise MindeeError("Provided path is not a file.")
if not output_path or not out_path.parent.exists():
raise MindeeError("Invalid save path provided {}.")
if out_path.suffix.lower() != "pdf":
out_path = out_path.parent / (out_path.stem + "." + "pdf")
self.pdf_bytes.seek(0)
with open(out_path, "wb") as out_file:
out_file.write(self.pdf_bytes.read())
if not out_path.resolve().is_dir():
raise MindeeError("Provided path is not a directory.")
out_file_path = out_path / self.filename

try:
self.buffer.seek(0)
with open(out_file_path, "wb") as out_file:
out_file.write(self.buffer.read())
except Exception as e:
print(e)
raise MindeeError(f"Could not save file {out_file_path}.") from e

def as_input_source(self) -> BytesInput:
"""Returns the current PDF object as a usable BytesInput source."""
self.pdf_bytes.seek(0)
return BytesInput(self.pdf_bytes.read(), self.filename)
self.buffer.seek(0)
return BytesInput(self.buffer.read(), self.filename)

@property
def page_indexes(self) -> tuple[int, int]:
"""This PDF was extracted from this page range of the original PDF."""
return self._page_indexes
13 changes: 13 additions & 0 deletions mindee/pdf/extracted_pdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pathlib import Path

from mindee.pdf.extracted_pdf import ExtractedPDF


class ExtractedPDFs(list[ExtractedPDF]):
    """List of extracted PDFs."""

    def save_all_to_disk(self, output_path: Path | str) -> None:
        """
        Save all extracted PDFs to disk.

        Calls ``save_to_file`` on every PDF held in this list, in list order.

        :param output_path: Destination handed unchanged to each PDF's
            ``save_to_file``.
        """
        # The elements are ExtractedPDF objects, not images; name them accordingly.
        for extracted_pdf in self:
            extracted_pdf.save_to_file(output_path)
10 changes: 6 additions & 4 deletions mindee/pdf/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def extract_sub_documents(
"""
Extract the sub-documents from the main pdf, based on the given list of page indexes.

:param page_indexes: List of list of numbers, representing page indexes.
:param page_indexes: 2D list of numbers, representing page indexes.
:return: A list of created PDFS.
"""
extracted_pdfs: list[ExtractedPDF] = []
Expand All @@ -80,10 +80,12 @@ def extract_sub_documents(
for page_index in page_index_elem:
if page_index > self.get_page_count():
raise MindeeError(f"Index {page_index} is out of range.")
formatted_max_index = f"{page_index_elem[len(page_index_elem) - 1] + 1:03d}"
field_filename = f"{stem}_{(page_index_elem[0] + 1):03d}-{formatted_max_index}{extension}"
first_page = page_index_elem[0]
last_page = page_index_elem[len(page_index_elem) - 1]
extracted_pdf = ExtractedPDF(
self.cut_pages(page_index_elem), field_filename
self.cut_pages(page_index_elem),
f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}{extension}",
(first_page, last_page),
)
extracted_pdfs.append(extracted_pdf)
return extracted_pdfs
Expand Down
6 changes: 3 additions & 3 deletions mindee/v2/file_operations/crop.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from mindee.error import MindeeError
from mindee.geometry import Point, Polygon
from mindee.image.extracted_image import ExtractedImage
from mindee.image.extracted_images import ExtractedImages
from mindee.image.image_extractor import extract_multiple_images_from_source
from mindee.input.local_input_source import LocalInputSource
from mindee.v2.file_operations.crop_files import CropFiles
from mindee.v2.parsing.inference.field import FieldLocation
from mindee.v2.product.crop.crop_item import CropItem

Expand All @@ -25,7 +25,7 @@ def extract_single_crop(

def extract_multiple_crops(
input_source: LocalInputSource, crops: list[CropItem]
) -> CropFiles:
) -> ExtractedImages:
"""
Extracts individual receipts from multi-receipts documents.

Expand All @@ -49,4 +49,4 @@ def extract_multiple_crops(
polygon,
)
)
return CropFiles(images)
return ExtractedImages(images)
20 changes: 0 additions & 20 deletions mindee/v2/file_operations/crop_files.py

This file was deleted.

Loading
Loading