diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index f03baa524d..2754d4285a 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1970,13 +1970,14 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: return values.cast(target_type) raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}") elif isinstance(field.field_type, (IntegerType, LongType)): - # Cast smaller integer types to target type for cross-platform compatibility - # Only allow widening conversions (smaller bit width to larger) - # Narrowing conversions fall through to promote() handling below + # Cast integer types for cross-platform compatibility (e.g. Spark reads): + # widening (smaller bit width to larger) and unsigned-to-signed at same width if pa.types.is_integer(values.type): source_width = values.type.bit_width target_width = target_type.bit_width - if source_width < target_width: + if source_width < target_width or ( + pa.types.is_unsigned_integer(values.type) and source_width <= target_width + ): return values.cast(target_type) if field.field_type != file_field.field_type: diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index bcbf873c2a..d8592b55db 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -3084,6 +3084,7 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception( (pa.int8(), IntegerType(), pa.int32()), (pa.int16(), IntegerType(), pa.int32()), (pa.uint16(), IntegerType(), pa.int32()), + (pa.uint32(), IntegerType(), pa.int32()), (pa.uint32(), LongType(), pa.int64()), (pa.int32(), LongType(), pa.int64()), ], @@ -3109,6 +3110,19 @@ def test__to_requested_schema_integer_promotion( assert result.column(0).to_pylist() == [1, 2, 3, None] +def test__to_requested_schema_uint32_overflow_raises() -> None: + """Test that uint32 values exceeding INT32_MAX raise an error rather than wrapping.""" + requested_schema = Schema(NestedField(1, "col", IntegerType(), required=False)) + file_schema = requested_schema + + arrow_schema = pa.schema([pa.field("col", pa.uint32())]) + data = pa.array([2**31], type=pa.uint32()) + batch = pa.RecordBatch.from_arrays([data], schema=arrow_schema) + + with pytest.raises(pa.lib.ArrowInvalid, match="Integer value .* not in range"): + _to_requested_schema(requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False) + + def test_pyarrow_file_io_fs_by_scheme_cache() -> None: # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument # becomes available for `resolve_s3_region`