Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1970,13 +1970,14 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
return values.cast(target_type)
raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}")
elif isinstance(field.field_type, (IntegerType, LongType)):
# Cast smaller integer types to target type for cross-platform compatibility
# Only allow widening conversions (smaller bit width to larger)
# Narrowing conversions fall through to promote() handling below
# Cast integer types for cross-platform compatibility (e.g. Spark reads):
# widening (smaller bit width to larger) and unsigned-to-signed at same width
if pa.types.is_integer(values.type):
source_width = values.type.bit_width
target_width = target_type.bit_width
if source_width < target_width:
if source_width < target_width or (
pa.types.is_unsigned_integer(values.type) and source_width <= target_width
):
return values.cast(target_type)

if field.field_type != file_field.field_type:
Expand Down
14 changes: 14 additions & 0 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3084,6 +3084,7 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception(
(pa.int8(), IntegerType(), pa.int32()),
(pa.int16(), IntegerType(), pa.int32()),
(pa.uint16(), IntegerType(), pa.int32()),
(pa.uint32(), IntegerType(), pa.int32()),
(pa.uint32(), LongType(), pa.int64()),
(pa.int32(), LongType(), pa.int64()),
],
Expand All @@ -3109,6 +3110,19 @@ def test__to_requested_schema_integer_promotion(
assert result.column(0).to_pylist() == [1, 2, 3, None]


def test__to_requested_schema_uint32_overflow_raises() -> None:
"""Test that uint32 values exceeding INT32_MAX raise an error rather than wrapping."""
requested_schema = Schema(NestedField(1, "col", IntegerType(), required=False))
file_schema = requested_schema

arrow_schema = pa.schema([pa.field("col", pa.uint32())])
data = pa.array([2**31], type=pa.uint32())
batch = pa.RecordBatch.from_arrays([data], schema=arrow_schema)

with pytest.raises(pa.lib.ArrowInvalid, match="Integer value .* not in range"):
_to_requested_schema(requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False)


def test_pyarrow_file_io_fs_by_scheme_cache() -> None:
# It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument
# becomes available for `resolve_s3_region`
Expand Down