diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 98d2a5e5cdf00e2..4665e3292477526 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -227,9 +227,10 @@ ZipFile objects Similar behavior occurs with files newer than 2107-12-31, the timestamp is also set to the limit. - When mode is ``'r'``, *metadata_encoding* may be set to the name of a codec, - which will be used to decode metadata such as the names of members and ZIP - comments. + The *metadata_encoding* argument may be set to the name of a codec, which + will be used to decode metadata such as the names of members and ZIP + comments when reading from an archive. This argument is ignored for writing + new members. If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then :meth:`closed <close>` without adding any files to the archive, the appropriate @@ -285,6 +286,10 @@ ZipFile objects Added support for specifying member name encoding for reading metadata in the zipfile's directory and file headers. + .. versionchanged:: next + The *metadata_encoding* parameter is now allowed in all modes. Previously + it could only be specified in mode ``'r'``. + .. method:: ZipFile.close() diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 4f20209927e7b3d..c69129ee6176549 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1440,15 +1440,7 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): def comparable_zinfo(zinfo): """Return a dict of public ZipInfo attributes for assertEqual comparison.""" - attrs = {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS} - - # Since patch gh-84353, the _MASK_UTF_FILENAME (0x800) bit may be - # changed when writing to the end record depending on whether filename - # can be encoded with ascii or cp437. Skip checking this bit by - # pretending it's always set. 
- attrs['flag_bits'] |= 0x800 - - return attrs + return {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS} _struct_pack = struct.pack @@ -5710,15 +5702,27 @@ def setUp(self): with open(TESTFN, "wb") as tf: tf.write(data) - def _test_read(self, zipfp, expected_names, expected_content): + def _test_read(self, zipfp, expected_names, expected_content, + expected_comments=None, expected_efs_flags=None): # Check the namelist names = zipfp.namelist() - self.assertEqual(sorted(names), sorted(expected_names)) + self.assertEqual(names, expected_names) # Check infolist infos = zipfp.infolist() names = [zi.filename for zi in infos] - self.assertEqual(sorted(names), sorted(expected_names)) + self.assertEqual(names, expected_names) + + if expected_comments is not None: + comments = [zi.comment for zi in infos] + self.assertEqual(comments, expected_comments) + + if expected_efs_flags is not None: + efs_flags = [ + bool(zi.flag_bits & zipfile._MASK_UTF_FILENAME) + for zi in infos + ] + self.assertEqual(efs_flags, expected_efs_flags) # check getinfo for name, content in zip(expected_names, expected_content): @@ -5731,6 +5735,8 @@ def test_read_with_metadata_encoding(self): # Read the ZIP archive with correct metadata_encoding with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, self.file_names, self.file_content) + with zipfile.ZipFile(TESTFN, "a", metadata_encoding='shift_jis') as zipfp: + self._test_read(zipfp, self.file_names, self.file_content) def test_read_without_metadata_encoding(self): # Read the ZIP archive without metadata_encoding @@ -5738,6 +5744,8 @@ def test_read_without_metadata_encoding(self): for name in self.file_names[:2]] + self.file_names[2:] with zipfile.ZipFile(TESTFN, "r") as zipfp: self._test_read(zipfp, expected_names, self.file_content) + with zipfile.ZipFile(TESTFN, "a") as zipfp: + self._test_read(zipfp, expected_names, self.file_content) def test_read_with_incorrect_metadata_encoding(self): # Read the ZIP 
archive with incorrect metadata_encoding @@ -5745,6 +5753,8 @@ def test_read_with_incorrect_metadata_encoding(self): for name in self.file_names[:2]] + self.file_names[2:] with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp: self._test_read(zipfp, expected_names, self.file_content) + with zipfile.ZipFile(TESTFN, "a", metadata_encoding='koi8-u') as zipfp: + self._test_read(zipfp, expected_names, self.file_content) def test_read_with_unsuitable_metadata_encoding(self): # Read the ZIP archive with metadata_encoding unsuitable for @@ -5753,6 +5763,10 @@ def test_read_with_unsuitable_metadata_encoding(self): zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii') with self.assertRaises(UnicodeDecodeError): zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8') + with self.assertRaises(UnicodeDecodeError): + zipfile.ZipFile(TESTFN, "a", metadata_encoding='ascii') + with self.assertRaises(UnicodeDecodeError): + zipfile.ZipFile(TESTFN, "a", metadata_encoding='utf-8') def test_read_after_append(self): newname = '\u56db' # Han 'four' @@ -5766,20 +5780,76 @@ def test_read_after_append(self): with zipfile.ZipFile(TESTFN, "a") as zipfp: zipfp.writestr(newname, "newcontent") zipfp.writestr(newname2, "newcontent2") - self.assertEqual(sorted(zipfp.namelist()), sorted(mojibake_expected_names)) + self.assertEqual(zipfp.namelist(), mojibake_expected_names) with zipfile.ZipFile(TESTFN, "r") as zipfp: self._test_read(zipfp, mojibake_expected_names, expected_content) + with zipfile.ZipFile(TESTFN, "a") as zipfp: + self._test_read(zipfp, mojibake_expected_names, expected_content) with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, expected_names, expected_content) + with zipfile.ZipFile(TESTFN, "a", metadata_encoding='shift_jis') as zipfp: + self._test_read(zipfp, expected_names, expected_content) + + def test_append_keep_efs_flag(self): + """Files loaded from an archive should keep original EFS flags when + rewritten to 
central directory in append mode.""" + names = ['file1', 'file2', 'file3', 'file4'] + contents = [b'content1', b'content2', b'content3', b'content4'] + comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('shift_jis'), b'bar'] + efs_flags = [True, True, False, False] + + def mock_encode(self): + if efs_flags[i]: + zinfo.flag_bits |= zipfile._MASK_UTF_FILENAME + return (self.filename.encode('ascii'), self.flag_bits) + + with mock.patch('zipfile.ZipInfo._encodeFilenameFlags', mock_encode), \ + zipfile.ZipFile(TESTFN, "w") as zipfp: + for i, name in enumerate(names): + zinfo = zipfile.ZipInfo(name) + zinfo.comment = comments[i] + zipfp.writestr(zinfo, contents[i]) + + with zipfile.ZipFile(TESTFN, "a") as zipfp: + # trigger archive rewriting + zipfp.comment = b'comment' + + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self.assertEqual(zipfp.comment, b'comment') + self._test_read(zipfp, names, contents, comments, efs_flags) + + def test_write_enforce_efs_flag(self): + """New files should enforce EFS flag if filename or comment is not ASCII.""" + names = ['\u4e00', '\u4e8c', 'file3', 'file4'] + contents = [b'content1', b'content2', b'content3', b'content4'] + comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('utf-8'), b'bar'] + expected_efs_flags = [True, True, True, False] + + with zipfile.ZipFile(TESTFN, "w") as zipfp: + for i, name in enumerate(names): + zinfo = zipfile.ZipInfo(name) + zinfo.comment = comments[i] + zipfp.writestr(zinfo, contents[i]) + self.assertEqual(zipfp.namelist(), names) + + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self._test_read(zipfp, names, contents, comments, expected_efs_flags) def test_write_with_metadata_encoding(self): - ZF = zipfile.ZipFile + """metadata_encoding should not affect the encoding of new files.""" + names = ['\u4e00', 'file2'] + contents = ['\u4e00'.encode('utf-8'), '\u4e8c'.encode('utf-8')] + expected_efs_flags = [True, False] + for mode in ("w", "x", "a"): - with 
self.assertRaisesRegex(ValueError, - "^metadata_encoding is only"): - ZF("nonesuch.zip", mode, metadata_encoding="shift_jis") + unlink(TESTFN) + with zipfile.ZipFile(TESTFN, mode, metadata_encoding='shift_jis') as zipfp: + for i, name in enumerate(names): + zipfp.writestr(name, contents[i]) + with zipfile.ZipFile(TESTFN, 'r') as zipfp: + self._test_read(zipfp, names, contents, None, expected_efs_flags) def test_add_comment(self): with zipfile.ZipFile(TESTFN, "r") as zipfp: diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 418933a2e8d9e87..3c4e0a87f8655c0 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -453,6 +453,7 @@ class ZipInfo: 'file_size', '_raw_time', '_end_offset', + '_metadata_encoding', ) def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): @@ -488,6 +489,7 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.compress_size = 0 # Size of the compressed file self.file_size = 0 # Size of the uncompressed file self._end_offset = None # Start of the next local header or central directory + self._metadata_encoding = None # Encoding used when read from the archive # Other attributes are set by class ZipFile: # header_offset Byte offset to the file header # CRC CRC-32 of the uncompressed file @@ -575,12 +577,18 @@ def FileHeader(self, zip64=None): def _encodeFilenameFlags(self): if self.flag_bits & _MASK_UTF_FILENAME: - encoding = 'ascii' - else: - encoding = 'cp437' + return self.filename.encode('utf-8'), self.flag_bits + + # For a file read from the archive, preserve its original encoding. + encoding = self._metadata_encoding + if encoding: + return self.filename.encode(encoding), self.flag_bits + + # For a newly added file, enforce EFS if filename or comment is non-ASCII. 
try: - return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME - except UnicodeEncodeError: + self.comment.decode('ascii') + return self.filename.encode('ascii'), self.flag_bits + except (UnicodeEncodeError, UnicodeDecodeError): return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME def _decodeExtra(self, filename_crc): @@ -1917,11 +1925,6 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self._strict_timestamps = strict_timestamps self.metadata_encoding = metadata_encoding - # Check that we don't try to write with nonconforming codecs - if self.metadata_encoding and mode != 'r': - raise ValueError( - "metadata_encoding is only supported for reading files") - # Check if we were passed a file-like object if isinstance(file, os.PathLike): file = os.fspath(file) @@ -2072,6 +2075,7 @@ def _RealGetContents(self): t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) x._decodeExtra(orig_filename_crc) x.header_offset = x.header_offset + concat + x._metadata_encoding = self.metadata_encoding or 'cp437' self.filelist.append(x) self.NameToInfo[x.filename] = x @@ -2286,7 +2290,7 @@ def _open_to_write(self, zinfo, force_zip64=False): zinfo.compress_size = 0 zinfo.CRC = 0 - zinfo.flag_bits = _MASK_UTF_FILENAME + zinfo.flag_bits = 0x00 if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 diff --git a/Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst b/Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst new file mode 100644 index 000000000000000..4b5aa175ffef97d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst @@ -0,0 +1,5 @@ +Fix an issue where the EFS flag is not set when a file with ASCII +filename and UTF-8 comment is written through :mod:`zipfile`. 
Also +preserve the original encoding and EFS flag for a file read from an +archive and rewritten through the ``'a'`` mode. Additionally allow +the ``metadata_encoding`` parameter in all modes.