From 6e81d210c117dd75df766b586b3cd309c2a8428c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 2 Jul 2026 00:38:01 +0800 Subject: [PATCH 1/5] gh-152845: Remove unneeded sort for tests in `EncodedMetadataTests` --- Lib/test/test_zipfile/test_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 4f20209927e7b3..b7031114d29386 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -5713,12 +5713,12 @@ def setUp(self): def _test_read(self, zipfp, expected_names, expected_content): # Check the namelist names = zipfp.namelist() - self.assertEqual(sorted(names), sorted(expected_names)) + self.assertEqual(names, expected_names) # Check infolist infos = zipfp.infolist() names = [zi.filename for zi in infos] - self.assertEqual(sorted(names), sorted(expected_names)) + self.assertEqual(names, expected_names) # check getinfo for name, content in zip(expected_names, expected_content): @@ -5766,7 +5766,7 @@ def test_read_after_append(self): with zipfile.ZipFile(TESTFN, "a") as zipfp: zipfp.writestr(newname, "newcontent") zipfp.writestr(newname2, "newcontent2") - self.assertEqual(sorted(zipfp.namelist()), sorted(mojibake_expected_names)) + self.assertEqual(zipfp.namelist(), mojibake_expected_names) with zipfile.ZipFile(TESTFN, "r") as zipfp: self._test_read(zipfp, mojibake_expected_names, expected_content) From 40125e5ee4123f6ab04be6c63d0ec13d37374e39 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 2 Jul 2026 01:10:53 +0800 Subject: [PATCH 2/5] gh-152845: Fix missing EFS flag for files with UTF-8 comments Fix a regression introduced by gh-84353/gh-150091 where the EFS flag was dropped or omitted when a file with an ASCII filename and a UTF-8 comment was written to an archive. 
This affected both newly added files and existing files rewritten to the central directory in append mode, causing an unexpected metadata change and leading to comment mis-decoding. Introduce an internal `_metadata_encoding` attribute for `ZipInfo` to ensure that files read from an archive preserve their original encoding and EFS flags, while newly added files now properly enforce EFS if they contain a non-ASCII filename or comment. --- Lib/test/test_zipfile/test_core.py | 69 +++++++++++++++++++++++++----- Lib/zipfile/__init__.py | 21 ++++++--- 2 files changed, 74 insertions(+), 16 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index b7031114d29386..d27e2831dfb184 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1440,15 +1440,7 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): def comparable_zinfo(zinfo): """Return a dict of public ZipInfo attributes for assertEqual comparison.""" - attrs = {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS} - - # Since patch gh-84353, the _MASK_UTF_FILENAME (0x800) bit may be - # changed when writing to the end record depending on whether filename - # can be encoded with ascii or cp437. Skip checking this bit by - # pretending it's always set. 
- attrs['flag_bits'] |= 0x800 - - return attrs + return {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS} _struct_pack = struct.pack @@ -5710,7 +5702,8 @@ def setUp(self): with open(TESTFN, "wb") as tf: tf.write(data) - def _test_read(self, zipfp, expected_names, expected_content): + def _test_read(self, zipfp, expected_names, expected_content, + expected_comments=None, expected_efs_flags=None): # Check the namelist names = zipfp.namelist() self.assertEqual(names, expected_names) @@ -5720,6 +5713,17 @@ def _test_read(self, zipfp, expected_names, expected_content): names = [zi.filename for zi in infos] self.assertEqual(names, expected_names) + if expected_comments is not None: + comments = [zi.comment for zi in infos] + self.assertEqual(comments, expected_comments) + + if expected_efs_flags is not None: + efs_flags = [ + bool(zi.flag_bits & zipfile._MASK_UTF_FILENAME) + for zi in infos + ] + self.assertEqual(efs_flags, expected_efs_flags) + # check getinfo for name, content in zip(expected_names, expected_content): info = zipfp.getinfo(name) @@ -5774,6 +5778,51 @@ def test_read_after_append(self): with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, expected_names, expected_content) + def test_append_keep_efs_flag(self): + """Files loaded from an archive should keep original EFS flags when + rewritten to central directory in append mode.""" + names = ['file1', 'file2', 'file3', 'file4'] + contents = [b'content1', b'content2', b'content3', b'content4'] + comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('shift_jis'), b'bar'] + efs_flags = [True, True, False, False] + + def mock_encode(self): + if efs_flags[i]: + zinfo.flag_bits |= zipfile._MASK_UTF_FILENAME + return (self.filename.encode('ascii'), self.flag_bits) + + with mock.patch('zipfile.ZipInfo._encodeFilenameFlags', mock_encode), \ + zipfile.ZipFile(TESTFN, "w") as zipfp: + for i, name in enumerate(names): + zinfo = zipfile.ZipInfo(name) + zinfo.comment 
= comments[i] + zipfp.writestr(zinfo, contents[i]) + + with zipfile.ZipFile(TESTFN, "a") as zipfp: + # trigger archive rewriting + zipfp.comment = b'comment' + + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self.assertEqual(zipfp.comment, b'comment') + self._test_read(zipfp, names, contents, comments, efs_flags) + + def test_write_enforce_efs_flag(self): + """New files should enforce EFS flag if filename or comment is not ASCII.""" + names = ['\u4e00', '\u4e8c', 'file3', 'file4'] + contents = [b'content1', b'content2', b'content3', b'content4'] + comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('utf-8'), b'bar'] + expected_efs_flags = [True, True, True, False] + + with zipfile.ZipFile(TESTFN, "w") as zipfp: + for i, name in enumerate(names): + zinfo = zipfile.ZipInfo(name) + zinfo.comment = comments[i] + zipfp.writestr(zinfo, contents[i]) + self.assertEqual(zipfp.namelist(), names) + + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self._test_read(zipfp, names, contents, comments, expected_efs_flags) + def test_write_with_metadata_encoding(self): ZF = zipfile.ZipFile for mode in ("w", "x", "a"): diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 418933a2e8d9e8..e1ec4a3783f7bd 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -453,6 +453,7 @@ class ZipInfo: 'file_size', '_raw_time', '_end_offset', + '_metadata_encoding', ) def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): @@ -488,6 +489,7 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.compress_size = 0 # Size of the compressed file self.file_size = 0 # Size of the uncompressed file self._end_offset = None # Start of the next local header or central directory + self._metadata_encoding = None # Encoding used when read from the archive # Other attributes are set by class ZipFile: # header_offset Byte offset to the file header # CRC CRC-32 of the uncompressed file @@ -575,12 +577,18 @@ def FileHeader(self, zip64=None): def 
_encodeFilenameFlags(self): if self.flag_bits & _MASK_UTF_FILENAME: - encoding = 'ascii' - else: - encoding = 'cp437' + return self.filename.encode('utf-8'), self.flag_bits + + # For a file read from the archive, preserve its original encoding. + encoding = self._metadata_encoding + if encoding: + return self.filename.encode(encoding), self.flag_bits + + # For a newly added file, enforce EFS if filename or comment is non-ASCII. try: - return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME - except UnicodeEncodeError: + self.comment.decode('ascii') + return self.filename.encode('ascii'), self.flag_bits + except (UnicodeEncodeError, UnicodeDecodeError): return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME def _decodeExtra(self, filename_crc): @@ -2072,6 +2080,7 @@ def _RealGetContents(self): t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) x._decodeExtra(orig_filename_crc) x.header_offset = x.header_offset + concat + x._metadata_encoding = self.metadata_encoding or 'cp437' self.filelist.append(x) self.NameToInfo[x.filename] = x @@ -2286,7 +2295,7 @@ def _open_to_write(self, zinfo, force_zip64=False): zinfo.compress_size = 0 zinfo.CRC = 0 - zinfo.flag_bits = _MASK_UTF_FILENAME + zinfo.flag_bits = 0x00 if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 From eae95bdc4a8852b0edb8ada1838f302820ec4a1c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 2 Jul 2026 11:14:49 +0800 Subject: [PATCH 3/5] gh-152845: Allow metadata_encoding in all ZipFile modes Allow the `metadata_encoding` parameter in all modes, enabling proper decoding with a customized codec in 'a' mode. This parameter is ignored for 'w' and 'x' modes. 
--- Lib/test/test_zipfile/test_core.py | 29 +++++++++++++++++++++++++---- Lib/zipfile/__init__.py | 5 ----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index d27e2831dfb184..c69129ee617654 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -5735,6 +5735,8 @@ def test_read_with_metadata_encoding(self): # Read the ZIP archive with correct metadata_encoding with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, self.file_names, self.file_content) + with zipfile.ZipFile(TESTFN, "a", metadata_encoding='shift_jis') as zipfp: + self._test_read(zipfp, self.file_names, self.file_content) def test_read_without_metadata_encoding(self): # Read the ZIP archive without metadata_encoding @@ -5742,6 +5744,8 @@ def test_read_without_metadata_encoding(self): for name in self.file_names[:2]] + self.file_names[2:] with zipfile.ZipFile(TESTFN, "r") as zipfp: self._test_read(zipfp, expected_names, self.file_content) + with zipfile.ZipFile(TESTFN, "a") as zipfp: + self._test_read(zipfp, expected_names, self.file_content) def test_read_with_incorrect_metadata_encoding(self): # Read the ZIP archive with incorrect metadata_encoding @@ -5749,6 +5753,8 @@ def test_read_with_incorrect_metadata_encoding(self): for name in self.file_names[:2]] + self.file_names[2:] with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp: self._test_read(zipfp, expected_names, self.file_content) + with zipfile.ZipFile(TESTFN, "a", metadata_encoding='koi8-u') as zipfp: + self._test_read(zipfp, expected_names, self.file_content) def test_read_with_unsuitable_metadata_encoding(self): # Read the ZIP archive with metadata_encoding unsuitable for @@ -5757,6 +5763,10 @@ def test_read_with_unsuitable_metadata_encoding(self): zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii') with self.assertRaises(UnicodeDecodeError): 
zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8') + with self.assertRaises(UnicodeDecodeError): + zipfile.ZipFile(TESTFN, "a", metadata_encoding='ascii') + with self.assertRaises(UnicodeDecodeError): + zipfile.ZipFile(TESTFN, "a", metadata_encoding='utf-8') def test_read_after_append(self): newname = '\u56db' # Han 'four' @@ -5774,9 +5784,13 @@ def test_read_after_append(self): with zipfile.ZipFile(TESTFN, "r") as zipfp: self._test_read(zipfp, mojibake_expected_names, expected_content) + with zipfile.ZipFile(TESTFN, "a") as zipfp: + self._test_read(zipfp, mojibake_expected_names, expected_content) with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: self._test_read(zipfp, expected_names, expected_content) + with zipfile.ZipFile(TESTFN, "a", metadata_encoding='shift_jis') as zipfp: + self._test_read(zipfp, expected_names, expected_content) def test_append_keep_efs_flag(self): """Files loaded from an archive should keep original EFS flags when @@ -5824,11 +5838,18 @@ def test_write_enforce_efs_flag(self): self._test_read(zipfp, names, contents, comments, expected_efs_flags) def test_write_with_metadata_encoding(self): - ZF = zipfile.ZipFile + """metadata_encoding should not affect the encoding of new files.""" + names = ['\u4e00', 'file2'] + contents = ['\u4e00'.encode('utf-8'), '\u4e8c'.encode('utf-8')] + expected_efs_flags = [True, False] + for mode in ("w", "x", "a"): - with self.assertRaisesRegex(ValueError, - "^metadata_encoding is only"): - ZF("nonesuch.zip", mode, metadata_encoding="shift_jis") + unlink(TESTFN) + with zipfile.ZipFile(TESTFN, mode, metadata_encoding='shift_jis') as zipfp: + for i, name in enumerate(names): + zipfp.writestr(name, contents[i]) + with zipfile.ZipFile(TESTFN, 'r') as zipfp: + self._test_read(zipfp, names, contents, None, expected_efs_flags) def test_add_comment(self): with zipfile.ZipFile(TESTFN, "r") as zipfp: diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 
e1ec4a3783f7bd..3c4e0a87f8655c 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1925,11 +1925,6 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self._strict_timestamps = strict_timestamps self.metadata_encoding = metadata_encoding - # Check that we don't try to write with nonconforming codecs - if self.metadata_encoding and mode != 'r': - raise ValueError( - "metadata_encoding is only supported for reading files") - # Check if we were passed a file-like object if isinstance(file, os.PathLike): file = os.fspath(file) From d462cc711761f8aca8938584bdf8838f0544b736 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 2 Jul 2026 11:52:01 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst diff --git a/Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst b/Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst new file mode 100644 index 00000000000000..4b5aa175ffef97 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-02-03-51-54.gh-issue-152845.D-XZCn.rst @@ -0,0 +1,5 @@ +Fix an issue where the EFS flag is not set when a file with ASCII +filename and UTF-8 comment is written through :mod:`zipfile`. Also +preserves the original encoding and EFS flag for a file read from an +archive and rewritten through the ``'a'`` mode. Additionally allows +the ``metadata_encoding`` parameter in all modes. 
From 9892dfdc1723ceb4afca2d6c75af752ea500631f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 3 Jul 2026 13:57:05 +0800 Subject: [PATCH 5/5] Document behavior changes --- Doc/library/zipfile.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 98d2a5e5cdf00e..4665e329247752 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -227,9 +227,10 @@ ZipFile objects Similar behavior occurs with files newer than 2107-12-31, the timestamp is also set to the limit. - When mode is ``'r'``, *metadata_encoding* may be set to the name of a codec, - which will be used to decode metadata such as the names of members and ZIP - comments. + The *metadata_encoding* argument may be set to the name of a codec, which + will be used to decode metadata such as the names of members and ZIP + comments when reading from an archive. This argument is ignored for writing + new members. If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then :meth:`closed <close>` without adding any files to the archive, the appropriate @@ -285,6 +286,10 @@ ZipFile objects Added support for specifying member name encoding for reading metadata in the zipfile's directory and file headers. + .. versionchanged:: next + The *metadata_encoding* parameter is now allowed in all modes. Previously + it could only be specified in mode ``'r'``. + .. method:: ZipFile.close()