diff --git a/README.md b/README.md index fb38630..ffea24b 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ The Python Shapefile Library (PyShp) reads and writes ESRI Shapefiles in pure Py - **Author**: [Joel Lawhead](https://github.com/GeospatialPython) - **Maintainers**: [James Parrott](https://github.com/JamesParrott) & [Karim Bahgat](https://github.com/karimbahgat) -- **Version**: 3.1.3 -- **Date**: 25th June 2026 +- **Version**: 3.1.4.dev +- **Date**: 27th June 2026 - **License**: [MIT](https://github.com/GeospatialPython/pyshp/blob/master/LICENSE.TXT) ## Contents @@ -93,6 +93,9 @@ part of your geospatial project. # Version Changes +## 3.1.4.dev +### Testing + - Test other codecs (ascii and unicode so far). ## 3.1.3 - Restore faster text writing paths for single-byte Ascii encodings, and Utf-8. diff --git a/changelog.txt b/changelog.txt index 5bca53a..3d00825 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,9 @@ + +VERSION 3.1.4.dev + +2026-06-27 + * Test other codecs (ascii and unicode so far). + VERSION 3.1.3 2026-06-25 diff --git a/src/shapefile.py b/src/shapefile.py index e240d60..9fccf32 100644 --- a/src/shapefile.py +++ b/src/shapefile.py @@ -8,7 +8,7 @@ from __future__ import annotations -__version__ = "3.1.3" +__version__ = "3.1.4.dev" import abc import array @@ -283,6 +283,7 @@ def _truncate_utf8_str( ) +@functools.cache def _BOM_and_dbf_decoded_pad_bytes( encoding: str = "utf8", ) -> tuple[bytes, Mapping[str, bytes]]: @@ -310,7 +311,6 @@ def _encode_dbf_string( size: int, decode: Decoder | None, pad_byte: bytes, - decoded_pad_bytes: Mapping[str, bytes], encoding: str = "utf8", encodingErrors: str = "strict", strict: bool = True, @@ -358,6 +358,8 @@ def _encode_dbf_string( f"to a short enough byte string, using {encoding=}, {encodingErrors=} ({BOM=!r})" ) + _BOM, decoded_pad_bytes = _BOM_and_dbf_decoded_pad_bytes(encoding) + for suffix, pad_bytes in decoded_pad_bytes.items(): if s.endswith(suffix): msg = ( @@ -523,7 +525,6 @@ def from_unchecked( cls, name: str, *, - decoded_pad_bytes: Mapping[str, bytes], field_type: str | bytes | FieldTypeT = "C", size: int = 50, decimal: int = 0, @@ -532,6 +533,8 @@ def from_unchecked( strict: bool = False, ) -> Field: + name = str(name) + if "\x00" in name: msg = ( "Field names should not contain null characters " @@ -567,11 +570,10 @@ def from_unchecked( # Only use the portion of the name that we are able to encode to # 10 bytes or less. _encoded_name, trimmed_name = cls.trim_name_until_encodable( - name=str(name), + name=name, encoding=encoding, encodingErrors=encodingErrors, strict=strict, - decoded_pad_bytes=decoded_pad_bytes, ) # A doctest in README.md previously passed in a string ('40') for size, @@ -586,7 +588,6 @@ def from_unchecked( encoding=encoding, encodingErrors=encodingErrors, strict=strict, - decoded_pad_bytes=decoded_pad_bytes, ) return inst @@ -597,14 +598,12 @@ def trim_name_until_encodable( encoding: str = "utf8", encodingErrors: str = "strict", strict: bool = False, - decoded_pad_bytes: Mapping[str, bytes] = {}, ) -> tuple[bytes, str]: return _encode_dbf_string( s=name, size=10, decode=cls.decode_name, pad_byte=b"\x00", - decoded_pad_bytes=decoded_pad_bytes, encoding=encoding, encodingErrors=encodingErrors, strict=strict, @@ -615,7 +614,6 @@ def encode_field_descriptor( encoding: str = "utf8", encodingErrors: str = "strict", strict: bool = False, - decoded_pad_bytes: Mapping[str, bytes] = {}, ) -> bytes: # encoded_name = self.name.encode(encoding, encodingErrors) # encoded_name = encoded_name[:10].ljust(10, b"\x00") @@ -624,7 +622,6 @@ def encode_field_descriptor( encoding=encoding, encodingErrors=encodingErrors, strict=strict, - decoded_pad_bytes=decoded_pad_bytes, ) encoded_field_type = self.field_type.encode("ascii") @@ -4242,7 +4239,6 @@ def field( encoding=self.encoding, encodingErrors=self.encodingErrors, strict=self.strict, - decoded_pad_bytes=self._decoded_pad_bytes, ) self.fields.append(field) @@ -4287,7 +4283,6 @@ def _header(self) -> None: encoding=self.encoding, encodingErrors=self.encodingErrors, strict=self.strict, - decoded_pad_bytes=self._decoded_pad_bytes, ) ) @@ -4454,7 +4449,7 @@ def _record(self, record: list[RecordValue]) -> None: ) if self.strict: raise DbfStringDataLoss(msg) - warnings.warn(msg) + warnings.warn(msg, category=PossibleDataLoss) encoded = encoded.ljust(size) else: @@ -4463,7 +4458,6 @@ def _record(self, record: list[RecordValue]) -> None: size=size, decode=_decode_C_or_M_field if self.strict else None, pad_byte=b" ", - decoded_pad_bytes=self._decoded_pad_bytes, encoding=self.encoding, encodingErrors=self.encodingErrors, strict=self.strict, @@ -4886,6 +4880,14 @@ def fields(self) -> list[Field]: def fields(self, value: list[Field]) -> None: self.dbf_writer.fields = value + @property + def strict(self) -> bool: + return self.dbf_writer.strict + + @strict.setter + def strict(self, value: bool) -> None: + self.dbf_writer.strict = value + @property def recNum(self) -> int: if not self._dbf_writer: diff --git a/tests/hypothesis_tests.py b/tests/hypothesis_tests.py index 8a4b830..8fd077f 100644 --- a/tests/hypothesis_tests.py +++ b/tests/hypothesis_tests.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import datetime import io import itertools @@ -538,15 +539,25 @@ def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None: "D": {"min_length": 8, "max_length": 8}, } + +ENCODINGS = [ + "ascii", + "utf-8", +] + +encodings = sampled_from(ENCODINGS) + + @composite -def dbf_fields(draw): +def _dbf_fields_strategy(draw, encoding: str) -> dict[str, str | int]: field_type, bounds_dict = draw(sampled_from(list(DBF_FIELD_TYPES.items()))) name = draw( text( alphabet=characters( - codec="ascii", - exclude_categories=["Z", "C"] # Z - Whitespace, C - Control chars++ + codec=encoding, + exclude_categories=["C"], # Z - Whitespace, C - Control chars++ + exclude_characters=[" "], ), min_size=1, max_size=10, @@ -559,23 +570,50 @@ def dbf_fields(draw): size = draw(integers(min_value=min_length, max_value=max_length)) decimal = draw(integers(min_value=0, max_value=max(0,min(size - 3, max_decimal)))) - return {"name": name, "field_type": field_type, "size": size, "decimal": decimal} +@composite +def encodings_and_dbf_fields(draw): + encoding = draw(encodings) + fields_strategy = _dbf_fields_strategy(encoding) + field = draw(fields_strategy) + return encoding, field + +def _get_fields_context(fields, codec, strict=False): + for field in fields: + if len(field["name"].encode(codec)) > 10: + return pytest.warns(shp.PossibleDataLoss) + if not strict and " " in field: + return pytest.warns(shp.PossibleDataLoss) + return contextlib.nullcontext() + @pytest.mark.hypothesis -@given(field_kwargs=dbf_fields()) +@settings(suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]) +@given(encoding_and_dbf_field=encodings_and_dbf_fields()) def test_dbf_Field_roundtrips( - field_kwargs: dict, + encoding_and_dbf_field: dict, ) -> None: - BOM, decoded_pad_bytes = shp._BOM_and_dbf_decoded_pad_bytes() + encoding, field_kwargs = encoding_and_dbf_field - expected = shp.Field.from_unchecked(decoded_pad_bytes=decoded_pad_bytes,**field_kwargs) + L = len(field_kwargs["name"].encode(encoding)) + context = _get_fields_context([field_kwargs], encoding, strict=False) + + with context: + expected = shp.Field.from_unchecked( + encoding=encoding, + strict=False, + **field_kwargs, + ) stream = io.BytesIO() + encoded = expected.encode_field_descriptor(strict=True) stream.write(encoded) stream.seek(0) - actual = shp.Field.from_byte_stream(stream) + actual = shp.Field.from_byte_stream( + stream, + encoding=encoding, + ) assert isinstance(actual, shp.Field) assert actual.name == expected.name assert actual[1:] == expected[1:] @@ -583,7 +621,7 @@ def test_dbf_Field_roundtrips( ascii_printable = string.ascii_letters + string.digits + string.punctuation + " " -def record_value_for_field(name: str, field_type: str, size: int, decimal: int = 0): +def record_value_for_field(name: str, field_type: str, size: int, decimal: int, encoding: str): if field_type == "C": return text( @@ -615,36 +653,45 @@ def record_value_for_field(name: str, field_type: str, size: int, decimal: int = raise ValueError(f"Unsupported: {field_type=}") -def _dbf_fields_and_record_strategy( +def _dbf_encoding_fields_and_record_strategy( draw, - max_fields=10, # In DbfWriter.__init__, max_num_fields: int = 2046, + max_fields: int=10, # In DbfWriter.__init__, max_num_fields: int = 2046, ): - fields = draw(lists(dbf_fields(), min_size=1, max_size=max_fields)) + encoding = draw(encodings) - record_strategy = tuples(*(record_value_for_field(**field) for field in fields)) + fields = draw(lists(_dbf_fields_strategy(encoding), min_size=1, max_size=max_fields)) - return fields, record_strategy + record_strategy = tuples(*(record_value_for_field(encoding=encoding, **field) for field in fields)) + + return encoding, fields, record_strategy @composite -def dbf_fields_and_records( +def dbf_encoding_fields_and_records( draw, max_fields=10, # In DbfWriter.__init__, max_num_fields: int = 2046, max_records=20, ): - fields, record_strategy = _dbf_fields_and_record_strategy(draw, max_fields) + encoding, fields, record_strategy = _dbf_encoding_fields_and_record_strategy(draw, max_fields) records = draw(lists(record_strategy, min_size=0, max_size=max_records)) - return fields, records + return encoding, fields, records -def _assert_reader_matches_expected_records(r, fields, written_records): + +def _assert_reader_matches_expected_fields(r, fields, written_records, writer_strict): for f_r, f_w in itertools.zip_longest(r.data_fields, fields): actual_field_dict = f_r._asdict() + actual_name = actual_field_dict["name"] + if not writer_strict: + actual_name = actual_name.replace(" ", "_") + assert f_w["name"].startswith(actual_name) for k in ("field_type", "size", "decimal"): assert actual_field_dict[k] == f_w[k], f"{k=}, {actual_field_dict[k]=}, {f_w[k]=}" + +def _assert_reader_matches_expected_records(r, fields, written_records): for exp_rec, actual_rec in itertools.zip_longest(written_records, r.records()): for expected, actual, field in itertools.zip_longest(exp_rec, actual_rec, fields): field_type = field["field_type"] @@ -660,14 +707,21 @@ def _assert_reader_matches_expected_records(r, fields, written_records): @pytest.mark.hypothesis -@given(fields_and_records=dbf_fields_and_records()) -def test_dbf_reader_writer_roundtrip(fields_and_records)-> None: - fields, records = fields_and_records +@given(codec_fields_and_records=dbf_encoding_fields_and_records()) +def test_dbf_reader_writer_roundtrip(codec_fields_and_records)-> None: + codec, fields, records = codec_fields_and_records stream = io.BytesIO() + fields_context = _get_fields_context(fields, codec, strict=False) written_records = [] - with shp.DbfWriter(dbf=stream, strict=True) as dbf_w: - for field in fields: - dbf_w.field(**field) + with shp.DbfWriter(dbf=stream, encoding=codec, strict=False) as dbf_w: + + # Only use strict = False to write fields, so that we still + # test the corresponding record values for any fields + # whose name was truncated. + with fields_context: + for field in fields: + dbf_w.field(**field) + dbf_w.strict = True for record in records: try: dbf_w.record(*record) @@ -676,50 +730,42 @@ def test_dbf_reader_writer_roundtrip(fields_and_records)-> None: else: written_records.append(record) - - with shp.DbfReader(dbf=stream) as r: + with shp.DbfReader(dbf=stream, encoding=codec) as r: + _assert_reader_matches_expected_fields(r, fields, written_records, False) _assert_reader_matches_expected_records(r, fields, written_records) -# def code_and_shape_strategy_from_triple(t): -# x, _name, shapes = t -# return tuples( -# just(x), -# lists( -# one_of(shapes, null_shapes), -# min_size = 0, # Empty shp files are in the ESRI spec. -# max_size=MAX_NUM_SHAPES, -# ), -# ) -# codes_and_shapes_strategies = [ -# code_and_shape_strategy_from_triple(t) -# for t in shape_codes_names_and_strategies -# ] - -# codes_and_shapes = one_of(codes_and_shapes_strategies) + @composite -def codes_fields_shapes_and_records(draw): +def codes_codecs_fields_shapes_and_records(draw): code, shapes = draw(codes_and_shapes) - fields, records_strategy = _dbf_fields_and_record_strategy(draw, max_fields=10) + encoding, fields, records_strategy = _dbf_encoding_fields_and_record_strategy(draw, max_fields=10) N = len(shapes) records = [draw(records_strategy) for _ in range(N)] - return code, fields, zip(shapes, records) + return code, encoding, fields, list(zip(shapes, records)) + @pytest.mark.hypothesis @settings(suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]) -@given(codes_fields_shapes_and_records=codes_fields_shapes_and_records()) -def test_shapefile_reader_writer_roundtrip(codes_fields_shapes_and_records)-> None: +@given(codes_codecs_fields_shapes_and_records=codes_codecs_fields_shapes_and_records()) +def test_shapefile_reader_writer_roundtrip(codes_codecs_fields_shapes_and_records)-> None: - code_ex, fields_ex, shapes_and_records = codes_fields_shapes_and_records + code_ex, encoding, fields_ex, shapes_and_records = codes_codecs_fields_shapes_and_records streams = {"shp" : io.BytesIO(), "shx" : io.BytesIO(), "dbf" : io.BytesIO(),} expected_shapes = [] expected_records = [] + fields_context = _get_fields_context(fields=fields_ex, codec=encoding, strict=False) - with shp.Writer(shapeType = code_ex, strict=True, **streams) as w: - for field in fields_ex: - w.field(**field) + with shp.Writer(shapeType = code_ex, encoding=encoding, strict=False, **streams) as w: + with fields_context: + for field in fields_ex: + w.field(**field) + # Only use strict = False to write fields, so that we still + # test the corresponding record values for any fields + # whose name was truncated. + w.strict=True for shape, record in shapes_and_records: try: w.record(*record) @@ -729,6 +775,7 @@ def test_shapefile_reader_writer_roundtrip(codes_fields_shapes_and_records)-> No expected_shapes.append(shape) expected_records.append(record) - with shp.Reader(**streams) as r: + with shp.Reader(encoding=encoding, **streams) as r: + _assert_reader_matches_expected_fields(r, fields_ex, expected_records, False) _assert_reader_matches_expected_records(r, fields_ex, expected_records) _assert_reader_matches_expected_shapes(r, code_ex, expected_shapes) \ No newline at end of file