Skip to content

Commit 99a2b40

Browse files
committed
gh-104169: Refactor tokenizer into lexer and wrappers
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes.
1 parent 5c6e854 commit 99a2b40

20 files changed

Lines changed: 1777 additions & 1662 deletions

Makefile.pre.in

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,20 +347,36 @@ PEGEN_OBJS= \
347347
Parser/string_parser.o \
348348
Parser/peg_api.o
349349

350+
TOKENIZER_OBJS= \
351+
Parser/lexer/buffer.o \
352+
Parser/lexer/lexer.o \
353+
Parser/lexer/state.o \
354+
Parser/tokenizer/file_tokenizer.o \
355+
Parser/tokenizer/readline_tokenizer.o \
356+
Parser/tokenizer/string_tokenizer.o \
357+
Parser/tokenizer/utf8_tokenizer.o \
358+
Parser/tokenizer/helpers.o
350359

351360
PEGEN_HEADERS= \
352361
$(srcdir)/Include/internal/pycore_parser.h \
353362
$(srcdir)/Parser/pegen.h \
354363
$(srcdir)/Parser/string_parser.h
355364

365+
TOKENIZER_HEADERS= \
366+
Parser/lexer/buffer.h \
367+
Parser/lexer/lexer.h \
368+
Parser/lexer/state.h \
369+
Parser/tokenizer/tokenizer.h \
370+
Parser/tokenizer/helpers.h
371+
356372
POBJS= \
357373
Parser/token.o \
358374

359-
PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
375+
PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o
360376

361377
PARSER_HEADERS= \
362378
$(PEGEN_HEADERS) \
363-
$(srcdir)/Parser/tokenizer.h
379+
$(TOKENIZER_HEADERS)
364380

365381
##########################################################################
366382
# Python

Parser/action_helpers.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include <Python.h>
22

33
#include "pegen.h"
4-
#include "tokenizer.h"
54
#include "string_parser.h"
65
#include "pycore_runtime.h" // _PyRuntime
76

Parser/lexer/buffer.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#include "Python.h"
2+
#include "errcode.h"
3+
4+
#include "state.h"
5+
6+
/* Traverse and remember all f-string buffers, in order to be able to restore
7+
them after reallocating tok->buf */
8+
void
9+
remember_fstring_buffers(struct tok_state *tok)
10+
{
11+
int index;
12+
tokenizer_mode *mode;
13+
14+
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
15+
mode = &(tok->tok_mode_stack[index]);
16+
mode->f_string_start_offset = mode->f_string_start - tok->buf;
17+
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
18+
}
19+
}
20+
21+
/* Traverse and restore all f-string buffers after reallocating tok->buf */
22+
void
23+
restore_fstring_buffers(struct tok_state *tok)
24+
{
25+
int index;
26+
tokenizer_mode *mode;
27+
28+
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
29+
mode = &(tok->tok_mode_stack[index]);
30+
mode->f_string_start = tok->buf + mode->f_string_start_offset;
31+
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
32+
}
33+
}
34+
35+
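/* NOTE(review): this comment appears to describe tok_readline_recode (it
   refers to a destination S and to tok->decoding_buffer), not the
   tok_reserve_buf function that follows it -- confirm and relocate.

   Read a line of text from TOK into S, using the stream in TOK.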
36+
Return NULL on failure, else S.
37+
38+
On entry, tok->decoding_buffer will be one of:
39+
1) NULL: need to call tok->decoding_readline to get a new line
40+
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
41+
stored the result in tok->decoding_buffer
42+
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
43+
(in the s buffer) to copy entire contents of the line read
44+
by tok->decoding_readline. tok->decoding_buffer has the overflow.
45+
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
46+
until the buffer ends with a '\n' (or until the end of the file is
47+
reached): see tok_nextc and its calls to tok_reserve_buf.
48+
*/
49+
int
50+
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
51+
{
52+
Py_ssize_t cur = tok->cur - tok->buf;
53+
Py_ssize_t oldsize = tok->inp - tok->buf;
54+
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
55+
if (newsize > tok->end - tok->buf) {
56+
char *newbuf = tok->buf;
57+
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
58+
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
59+
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
60+
remember_fstring_buffers(tok);
61+
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
62+
if (newbuf == NULL) {
63+
tok->done = E_NOMEM;
64+
return 0;
65+
}
66+
tok->buf = newbuf;
67+
tok->cur = tok->buf + cur;
68+
tok->inp = tok->buf + oldsize;
69+
tok->end = tok->buf + newsize;
70+
tok->start = start < 0 ? NULL : tok->buf + start;
71+
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
72+
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
73+
restore_fstring_buffers(tok);
74+
}
75+
return 1;
76+
}

Parser/lexer/buffer.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

/* Forward declaration: makes the prototypes below refer to the file-scope
   struct tok_state even when this header is included before the header that
   defines it.  Without it the tag would only have prototype scope and the
   prototypes would not match the definitions. */
struct tok_state;

/* Save every f-string pointer on the tokenizer mode stack as an offset
   from tok->buf. */
void remember_fstring_buffers(struct tok_state *tok);

/* Recompute every f-string pointer on the tokenizer mode stack from its
   saved offset and the current tok->buf. */
void restore_fstring_buffers(struct tok_state *tok);

/* Grow tok->buf if needed so that at least `size` more bytes fit after
   tok->inp.  Returns 1 on success, 0 on allocation failure (tok->done is
   then set to E_NOMEM). */
int tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif

0 commit comments

Comments
 (0)