From 5d58e8ae9f73db45ec2719e305a1742c58265f79 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Sun, 24 Jun 2018 01:58:19 -0400 Subject: [PATCH 1/8] bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior --- Lib/test/test_tokenize.py | 34 ++++++++++++------- Lib/tokenize.py | 6 ++++ .../2018-06-24-01-57-14.bpo-33899.IaOcAr.rst | 3 ++ 3 files changed, 31 insertions(+), 12 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 93e40de96e9eb2e..f3b0773bc173d01 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,7 +1,8 @@ from test import support from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, - open as tokenize_open, Untokenizer, generate_tokens) + open as tokenize_open, Untokenizer, generate_tokens, + NEWLINE) from io import BytesIO, StringIO import unittest from unittest import TestCase, mock @@ -15,17 +16,20 @@ class TokenizeTest(TestCase): # Tests for the tokenize module. # The tests can be really simple. Given a small fragment of source - # code, print out a table with tokens. The ENDMARKER is omitted for - # brevity. + # code, print out a table with tokens. The ENDMARKER, ENCODING and + # final NEWLINE are omitted for brevity. def check_tokenize(self, s, expected): # Format the tokens in s in a table format. - # The ENDMARKER is omitted. + # The ENDMARKER and final NEWLINE are omitted. 
result = [] f = BytesIO(s.encode('utf-8')) + num_lines = len(s.splitlines()) for type, token, start, end, line in tokenize(f.readline): if type == ENDMARKER: break + if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines: + continue type = tok_name[type] result.append(f" {type:10} {token!r:13} {start} {end}") self.assertEqual(result, @@ -922,12 +926,15 @@ async def bar(): pass class GenerateTokensTest(TokenizeTest): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. - # The ENDMARKER is omitted. + # The ENDMARKER and final NEWLINE are omitted. result = [] f = StringIO(s) + num_lines = len(s.splitlines()) for type, token, start, end, line in generate_tokens(f.readline): if type == ENDMARKER: break + if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines: + continue type = tok_name[type] result.append(f" {type:10} {token!r:13} {start} {end}") self.assertEqual(result, expected.rstrip().splitlines()) @@ -1022,8 +1029,8 @@ def readline(): else: return b'' - # skip the initial encoding token and the end token - tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] + # skip the initial encoding token and the end tokens + tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2] expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") @@ -1039,8 +1046,8 @@ def readline(): else: return b'' - # skip the end token - tokens = list(_tokenize(readline, encoding=None))[:-1] + # skip the end tokens + tokens = list(_tokenize(readline, encoding=None))[:-2] expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] self.assertEqual(tokens, expected_tokens, "string not tokenized when encoding is None") @@ -1351,18 +1358,21 @@ def test_oneline_defs(self): # Test that 500 consequent, one-line defs is OK toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline)) - self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER + 
self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER + # [-2] is always NEWLINE def assertExactTypeEqual(self, opstr, *optypes): tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline)) num_optypes = len(optypes) - self.assertEqual(len(tokens), 2 + num_optypes) + self.assertEqual(len(tokens), 3 + num_optypes) self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING]) for i in range(num_optypes): self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[optypes[i]]) self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type], + tok_name[token.NEWLINE]) + self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type], tok_name[token.ENDMARKER]) def test_exact_type(self): @@ -1515,7 +1525,7 @@ def test_roundtrip(self): self.check_roundtrip("if x == 1:\n" " print(x)\n") self.check_roundtrip("# This is a comment\n" - "# This also") + "# This also\n") # Some people use different formatting conventions, which makes # untokenize a little trickier. Note that this test involves trailing diff --git a/Lib/tokenize.py b/Lib/tokenize.py index c78d9f7e9ee5af5..7571c1d0b754296 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -492,8 +492,11 @@ def _tokenize(readline, encoding): # BOM will already have been stripped. 
encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') + last_line = b'' + line = b'' while True: # loop over lines in stream try: + last_line = line line = readline() except StopIteration: line = b'' @@ -648,6 +651,9 @@ def _tokenize(readline, encoding): (lnum, pos), (lnum, pos+1), line) pos += 1 + # Add an implicit NEWLINE if the input doesn't end in one + if len(last_line) > 0 and last_line[-1] not in '\r\n': + yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') for indent in indents[1:]: # pop remaining indent levels yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst new file mode 100644 index 000000000000000..f1f8a45f8641088 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst @@ -0,0 +1,3 @@ +Tokenize module now implicitly emits a NEWLINE when provided with input that +does not have a trailing new line. This behavior now matches what the C +tokenizer does internally. From 43a1bd45a5023f37337988e985779df6e24ad338 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Sun, 24 Jun 2018 02:24:06 -0400 Subject: [PATCH 2/8] Add a specific testcase for the change --- Lib/test/test_tokenize.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index f3b0773bc173d01..45c2f40e396893a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -36,6 +36,14 @@ def check_tokenize(self, s, expected): [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) + def test_implicit_newline(self): + # Make sure that the tokenizer puts in an implicit NEWLINE + # when the input lacks a trailing new line. 
+ f = BytesIO("x".encode('utf-8')) + tokens = list(tokenize(f.readline)) + self.assertEqual(tokens[-2].type, NEWLINE) + self.assertEqual(tokens[-1].type, ENDMARKER) + def test_basic(self): self.check_tokenize("1 + 1", """\ NUMBER '1' (1, 0) (1, 1) From 57c92d4bfcc4225be514a22b97d4a23b16d346c9 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Sun, 24 Jun 2018 14:31:35 -0400 Subject: [PATCH 3/8] Add comment explaining why last_line is in the start of the loop --- Lib/tokenize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 7571c1d0b754296..6510349f429c628 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -496,6 +496,9 @@ def _tokenize(readline, encoding): line = b'' while True: # loop over lines in stream try: + # This loop has multiple points where it can break out so we + # pick up the value for the last_line here, at one unified + # point to keep things simple. last_line = line line = readline() except StopIteration: From 679cd89790b100c8dd7097a4fa200d92190e8746 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Tue, 3 Jul 2018 09:22:03 -0700 Subject: [PATCH 4/8] Update comment with Tal's suggestions --- Lib/tokenize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 6510349f429c628..42a6c22af41daab 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -496,9 +496,10 @@ def _tokenize(readline, encoding): line = b'' while True: # loop over lines in stream try: - # This loop has multiple points where it can break out so we - # pick up the value for the last_line here, at one unified - # point to keep things simple. + # We capture the value of the line variable here because + # readline uses the empty string '' to signal end of input, + # hence `line` itself will always be overwritten at the end + # of this loop. 
last_line = line line = readline() except StopIteration: From 24214a743e9e1cdbdea1d8bc42a5ffb44c9036e3 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Wed, 4 Jul 2018 01:19:47 -0700 Subject: [PATCH 5/8] Fix whitespace issue --- Lib/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 42a6c22af41daab..cccb0b2b540da20 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -496,7 +496,7 @@ def _tokenize(readline, encoding): line = b'' while True: # loop over lines in stream try: - # We capture the value of the line variable here because + # We capture the value of the line variable here because # readline uses the empty string '' to signal end of input, # hence `line` itself will always be overwritten at the end # of this loop. From 055dffba8e6073f8bea9ade5cee459d709fb03a0 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Thu, 5 Jul 2018 22:33:02 -0400 Subject: [PATCH 6/8] Move common code out to a function --- Lib/test/test_tokenize.py | 41 ++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 45c2f40e396893a..f68580ccfb7c63c 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -12,6 +12,26 @@ import token +# Converts a source string into a list of textual representation +# of the tokens such as: +# ` NAME 'if' (1, 0) (1, 2)` +# to make writing tests easier. 
+def stringify_tokens_from_source(token_generator, source_string): + result = [] + num_lines = len(source_string.splitlines()) + missing_trailing_nl = source_string[-1] not in '\r\n' + + for type, token, start, end, line in token_generator: + if type == ENDMARKER: + break + # Ignore the new line on the last line if the input lacks one + if missing_trailing_nl and type == NEWLINE and end[0] == num_lines: + continue + type = tok_name[type] + result.append(f" {type:10} {token!r:13} {start} {end}") + + return result + class TokenizeTest(TestCase): # Tests for the tokenize module. @@ -22,16 +42,9 @@ class TokenizeTest(TestCase): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. - result = [] f = BytesIO(s.encode('utf-8')) - num_lines = len(s.splitlines()) - for type, token, start, end, line in tokenize(f.readline): - if type == ENDMARKER: - break - if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines: - continue - type = tok_name[type] - result.append(f" {type:10} {token!r:13} {start} {end}") + result = stringify_tokens_from_source(tokenize(f.readline), s) + self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) @@ -935,16 +948,8 @@ class GenerateTokensTest(TokenizeTest): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. 
- result = [] f = StringIO(s) - num_lines = len(s.splitlines()) - for type, token, start, end, line in generate_tokens(f.readline): - if type == ENDMARKER: - break - if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines: - continue - type = tok_name[type] - result.append(f" {type:10} {token!r:13} {start} {end}") + result = stringify_tokens_from_source(generate_tokens(f.readline), s) self.assertEqual(result, expected.rstrip().splitlines()) From 2cca9388bd640bfcee34a8f1674e4a1c93f040f1 Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Thu, 5 Jul 2018 22:40:56 -0400 Subject: [PATCH 7/8] Simplify last_line check --- Lib/tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index cccb0b2b540da20..fce010bc5e7aa7f 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -656,7 +656,7 @@ def _tokenize(readline, encoding): pos += 1 # Add an implicit NEWLINE if the input doesn't end in one - if len(last_line) > 0 and last_line[-1] not in '\r\n': + if last_line and last_line[-1] not in '\r\n': yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') for indent in indents[1:]: # pop remaining indent levels yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') From ae032e7329b47b0893ed62ede803704543c7483d Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Fri, 6 Jul 2018 08:48:47 +0300 Subject: [PATCH 8/8] update NEWS to mention contributor --- .../next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst index f1f8a45f8641088..21c9095993630e7 100644 --- a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst +++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst @@ -1,3 +1,3 @@ Tokenize module now implicitly emits a NEWLINE when provided with input that 
-does not have a trailing new line. This behavior now matches what the C
-tokenizer does internally.
+does not have a trailing new line. This behavior now matches what the C
+tokenizer does internally. Contributed by Ammar Askar.