From 5d58e8ae9f73db45ec2719e305a1742c58265f79 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Sun, 24 Jun 2018 01:58:19 -0400
Subject: [PATCH 1/8] bpo-33899: Make tokenize module mirror "end-of-file is
 end-of-line" behavior

---
 Lib/test/test_tokenize.py                     | 34 ++++++++++++-------
 Lib/tokenize.py                               |  6 ++++
 .../2018-06-24-01-57-14.bpo-33899.IaOcAr.rst  |  3 ++
 3 files changed, 31 insertions(+), 12 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 93e40de96e9eb2e..f3b0773bc173d01 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
@@ -15,17 +16,20 @@ class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
 
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
+        # The ENDMARKER and final NEWLINE are omitted.
         result = []
         f = BytesIO(s.encode('utf-8'))
+        num_lines = len(s.splitlines())
         for type, token, start, end, line in tokenize(f.readline):
             if type == ENDMARKER:
                 break
+            if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
+                continue
             type = tok_name[type]
             result.append(f"    {type:10} {token!r:13} {start} {end}")
         self.assertEqual(result,
@@ -922,12 +926,15 @@ async def bar(): pass
 class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
+        # The ENDMARKER and final NEWLINE are omitted.
         result = []
         f = StringIO(s)
+        num_lines = len(s.splitlines())
         for type, token, start, end, line in generate_tokens(f.readline):
             if type == ENDMARKER:
                 break
+            if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
+                continue
             type = tok_name[type]
             result.append(f"    {type:10} {token!r:13} {start} {end}")
         self.assertEqual(result, expected.rstrip().splitlines())
@@ -1022,8 +1029,8 @@ def readline():
             else:
                 return b''
 
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1039,8 +1046,8 @@ def readline():
             else:
                 return b''
 
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1351,18 +1358,21 @@ def test_oneline_defs(self):
 
         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
         self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                          tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
@@ -1515,7 +1525,7 @@ def test_roundtrip(self):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")
 
         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index c78d9f7e9ee5af5..7571c1d0b754296 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -492,8 +492,11 @@ def _tokenize(readline, encoding):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    last_line = b''
+    line = b''
     while True:                                # loop over lines in stream
         try:
+            last_line = line
             line = readline()
         except StopIteration:
             line = b''
@@ -648,6 +651,9 @@ def _tokenize(readline, encoding):
                            (lnum, pos), (lnum, pos+1), line)
                 pos += 1
 
+    # Add an implicit NEWLINE if the input doesn't end in one
+    if len(last_line) > 0 and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
new file mode 100644
index 000000000000000..f1f8a45f8641088
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
@@ -0,0 +1,3 @@
+Tokenize module now implicitly emits a NEWLINE when provided with input that
+does not have a trailing new line. This behavior now matches what the C
+tokenizer does internally.

From 43a1bd45a5023f37337988e985779df6e24ad338 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Sun, 24 Jun 2018 02:24:06 -0400
Subject: [PATCH 2/8] Add a specific testcase for the change

---
 Lib/test/test_tokenize.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index f3b0773bc173d01..45c2f40e396893a 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -36,6 +36,14 @@ def check_tokenize(self, s, expected):
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)

From 57c92d4bfcc4225be514a22b97d4a23b16d346c9 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Sun, 24 Jun 2018 14:31:35 -0400
Subject: [PATCH 3/8] Add comment explaining why last_line is at the start of
 the loop

---
 Lib/tokenize.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7571c1d0b754296..6510349f429c628 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -496,6 +496,9 @@ def _tokenize(readline, encoding):
     line = b''
     while True:                                # loop over lines in stream
         try:
+            # This loop has multiple points where it can break out so we
+            # pick up the value for the last_line here, at one unified
+            # point to keep things simple.
             last_line = line
             line = readline()
         except StopIteration:

From 679cd89790b100c8dd7097a4fa200d92190e8746 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Tue, 3 Jul 2018 09:22:03 -0700
Subject: [PATCH 4/8] Update comment with Tal's suggestions

---
 Lib/tokenize.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 6510349f429c628..42a6c22af41daab 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -496,9 +496,10 @@ def _tokenize(readline, encoding):
     line = b''
     while True:                                # loop over lines in stream
         try:
-            # This loop has multiple points where it can break out so we
-            # pick up the value for the last_line here, at one unified
-            # point to keep things simple.
+            # We capture the value of the line variable here because 
+            # readline uses the empty string '' to signal end of input,
+            # hence `line` itself will always be overwritten at the end
+            # of this loop.
             last_line = line
             line = readline()
         except StopIteration:

From 24214a743e9e1cdbdea1d8bc42a5ffb44c9036e3 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Wed, 4 Jul 2018 01:19:47 -0700
Subject: [PATCH 5/8] Fix whitespace issue

---
 Lib/tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 42a6c22af41daab..cccb0b2b540da20 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -496,7 +496,7 @@ def _tokenize(readline, encoding):
     line = b''
     while True:                                # loop over lines in stream
         try:
-            # We capture the value of the line variable here because 
+            # We capture the value of the line variable here because
             # readline uses the empty string '' to signal end of input,
             # hence `line` itself will always be overwritten at the end
             # of this loop.

From 055dffba8e6073f8bea9ade5cee459d709fb03a0 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Thu, 5 Jul 2018 22:33:02 -0400
Subject: [PATCH 6/8] Move common code out to a function

---
 Lib/test/test_tokenize.py | 41 ++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 45c2f40e396893a..f68580ccfb7c63c 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -12,6 +12,26 @@
 import token
 
 
+# Converts a source string into a list of textual representations
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
@@ -22,16 +42,9 @@ class TokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
-        result = []
         f = BytesIO(s.encode('utf-8'))
-        num_lines = len(s.splitlines())
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
-                continue
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
@@ -935,16 +948,8 @@ class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
-        result = []
         f = StringIO(s)
-        num_lines = len(s.splitlines())
-        for type, token, start, end, line in generate_tokens(f.readline):
-            if type == ENDMARKER:
-                break
-            if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
-                continue
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
         self.assertEqual(result, expected.rstrip().splitlines())
 
 

From 2cca9388bd640bfcee34a8f1674e4a1c93f040f1 Mon Sep 17 00:00:00 2001
From: Ammar Askar <ammar@ammaraskar.com>
Date: Thu, 5 Jul 2018 22:40:56 -0400
Subject: [PATCH 7/8] Simplify last_line check

---
 Lib/tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index cccb0b2b540da20..fce010bc5e7aa7f 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -656,7 +656,7 @@ def _tokenize(readline, encoding):
                 pos += 1
 
     # Add an implicit NEWLINE if the input doesn't end in one
-    if len(last_line) > 0 and last_line[-1] not in '\r\n':
+    if last_line and last_line[-1] not in '\r\n':
         yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')

From ae032e7329b47b0893ed62ede803704543c7483d Mon Sep 17 00:00:00 2001
From: Tal Einat <taleinat+github@gmail.com>
Date: Fri, 6 Jul 2018 08:48:47 +0300
Subject: [PATCH 8/8] update NEWS to mention contributor

---
 .../next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
index f1f8a45f8641088..21c9095993630e7 100644
--- a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
+++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
@@ -1,3 +1,3 @@
 Tokenize module now implicitly emits a NEWLINE when provided with input that
-does not have a trailing new line. This behavior now matches what the C
-tokenizer does internally.
+does not have a trailing new line.  This behavior now matches what the C
+tokenizer does internally.  Contributed by Ammar Askar.