@@ -2042,21 +2042,40 @@ def test_setitems_on_non_dicts(self):
20422042 def check_frame_opcodes (self , pickled ):
20432043 """
20442044 Check the arguments of FRAME opcodes in a protocol 4+ pickle.
2045+
2046+ Note that binary objects that are larger than FRAME_SIZE_TARGET are not
2047+ framed by default and are therefore considered a frame by themselves in
2048+ the following consistency check.
20452049 """
2046- frame_opcode_size = 9
2047- last_arg = last_pos = None
2050+ last_arg = last_pos = last_frame_opcode_size = None
2051+ frameless_opcode_sizes = {
2052+ 'BINBYTES' : 5 ,
2053+ 'BINUNICODE' : 5 ,
2054+ 'BINBYTES8' : 9 ,
2055+ 'BINUNICODE8' : 9 ,
2056+ }
20482057 for op , arg , pos in pickletools .genops (pickled ):
2049- if op .name != 'FRAME' :
2058+ if op .name in frameless_opcode_sizes :
2059+ if len (arg ) > self .FRAME_SIZE_TARGET :
2060+ frame_opcode_size = frameless_opcode_sizes [op .name ]
2061+ arg = len (arg )
2062+ else :
2063+ continue
2064+ elif op .name == 'FRAME' :
2065+ frame_opcode_size = 9
2066+ else :
20502067 continue
2068+
20512069 if last_pos is not None :
20522070 # The previous frame's size should be equal to the number
20532071 # of bytes up to the current frame.
2054- frame_size = pos - last_pos - frame_opcode_size
2072+ frame_size = pos - last_pos - last_frame_opcode_size
20552073 self .assertEqual (frame_size , last_arg )
20562074 last_arg , last_pos = arg , pos
2075+ last_frame_opcode_size = frame_opcode_size
20572076 # The last frame's size should be equal to the number of bytes up
20582077 # to the pickle's end.
2059- frame_size = len (pickled ) - last_pos - frame_opcode_size
2078+ frame_size = len (pickled ) - last_pos - last_frame_opcode_size
20602079 self .assertEqual (frame_size , last_arg )
20612080
20622081 def test_framing_many_objects (self ):
@@ -2076,15 +2095,36 @@ def test_framing_many_objects(self):
20762095
20772096 def test_framing_large_objects (self ):
20782097 N = 1024 * 1024
2079- obj = [b'x' * N , b'y' * N , b 'z' * N ]
2098+ obj = [b'x' * N , b'y' * N , 'z' * N ]
20802099 for proto in range (4 , pickle .HIGHEST_PROTOCOL + 1 ):
2081- with self .subTest (proto = proto ):
2082- pickled = self .dumps (obj , proto )
2083- unpickled = self .loads (pickled )
2084- self .assertEqual (obj , unpickled )
2085- n_frames = count_opcode (pickle .FRAME , pickled )
2086- self .assertGreaterEqual (n_frames , len (obj ))
2087- self .check_frame_opcodes (pickled )
2100+ for fast in [True , False ]:
2101+ with self .subTest (proto = proto , fast = fast ):
2102+ if hasattr (self , 'pickler' ):
2103+ buf = io .BytesIO ()
2104+ pickler = self .pickler (buf , protocol = proto )
2105+ pickler .fast = fast
2106+ pickler .dump (obj )
2107+ pickled = buf .getvalue ()
2108+ elif fast :
2109+ continue
2110+ else :
2111+ # Fallback to self.dumps when fast=False and
2112+ # self.pickler is not available.
2113+ pickled = self .dumps (obj , proto )
2114+ unpickled = self .loads (pickled )
2115+ # More informative error message in case of failure.
2116+ self .assertEqual ([len (x ) for x in obj ],
2117+ [len (x ) for x in unpickled ])
2118+ # Perform full equality check if the lengths match.
2119+ self .assertEqual (obj , unpickled )
2120+ n_frames = count_opcode (pickle .FRAME , pickled )
2121+ if not fast :
2122+ # One frame per memoize for each large object.
2123+ self .assertGreaterEqual (n_frames , len (obj ))
2124+ else :
2125+ # One frame at the beginning and one at the end.
2126+ self .assertGreaterEqual (n_frames , 2 )
2127+ self .check_frame_opcodes (pickled )
20882128
20892129 def test_optional_frames (self ):
20902130 if pickle .HIGHEST_PROTOCOL < 4 :
@@ -2125,6 +2165,71 @@ def remove_frames(pickled, keep_frame=None):
21252165 count_opcode (pickle .FRAME , pickled ))
21262166 self .assertEqual (obj , self .loads (some_frames_pickle ))
21272167
2168+ def test_framed_write_sizes_with_delayed_writer (self ):
2169+ class ChunkAccumulator :
2170+ """Accumulate pickler output in a list of raw chunks."""
2171+
2172+ def __init__ (self ):
2173+ self .chunks = []
2174+
2175+ def write (self , chunk ):
2176+ self .chunks .append (chunk )
2177+
2178+ def concatenate_chunks (self ):
2179+ # Some chunks can be memoryview instances; we need to convert
2180+ # them to bytes to be able to call join.
2181+ return b"" .join ([c .tobytes () if hasattr (c , 'tobytes' ) else c
2182+ for c in self .chunks ])
2183+
2184+ small_objects = [(str (i ).encode ('ascii' ), i % 42 , {'i' : str (i )})
2185+ for i in range (int (1e4 ))]
2186+
2187+ for proto in range (4 , pickle .HIGHEST_PROTOCOL + 1 ):
2188+ # Protocol 4 packs groups of small objects into frames and issues
2189+ # calls to write only once or twice per frame:
2190+ # The C pickler issues one call to write per frame (header and
2191+ # contents) while the Python pickler issues two calls to write:
2192+ # one for the frame header and one for the frame binary contents.
2193+ writer = ChunkAccumulator ()
2194+ self .pickler (writer , proto ).dump (small_objects )
2195+
2196+ # Actually read the binary content of the chunks after the end
2197+ # of the call to dump: any memoryview passed to write should not
2198+ # be released, otherwise this delayed access would not be possible.
2199+ pickled = writer .concatenate_chunks ()
2200+ reconstructed = self .loads (pickled )
2201+ self .assertEqual (reconstructed , small_objects )
2202+ self .assertGreater (len (writer .chunks ), 1 )
2203+
2204+ n_frames , remainder = divmod (len (pickled ), self .FRAME_SIZE_TARGET )
2205+ if remainder > 0 :
2206+ n_frames += 1
2207+
2208+ # There should be at least one call to write per frame
2209+ self .assertGreaterEqual (len (writer .chunks ), n_frames )
2210+
2211+ # but not too many either: there can be one for the proto,
2212+ # one per-frame header and one per frame for the actual contents.
2213+ self .assertGreaterEqual (2 * n_frames + 1 , len (writer .chunks ))
2214+
2215+ chunk_sizes = [len (c ) for c in writer .chunks [:- 1 ]]
2216+ large_sizes = [s for s in chunk_sizes
2217+ if s >= self .FRAME_SIZE_TARGET ]
2218+ small_sizes = [s for s in chunk_sizes
2219+ if s < self .FRAME_SIZE_TARGET ]
2220+
2221+ # Large chunks should not be too large:
2222+ for chunk_size in large_sizes :
2223+ self .assertGreater (2 * self .FRAME_SIZE_TARGET , chunk_size )
2224+
2225+ last_chunk_size = len (writer .chunks [- 1 ])
2226+ self .assertGreater (2 * self .FRAME_SIZE_TARGET , last_chunk_size )
2227+
2228+ # Small chunks (if any) should be very small
2229+ # (only proto and frame headers)
2230+ for chunk_size in small_sizes :
2231+ self .assertGreaterEqual (9 , chunk_size )
2232+
21282233 def test_nested_names (self ):
21292234 global Nested
21302235 class Nested :
0 commit comments