From 7e1fe69af74422d905e33ca520843c532b3d8073 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 5 Jan 2024 16:27:57 +0200 Subject: [PATCH 1/5] gh-113732: Fix support of QUOTE_NOTNULL and QUOTE_STRINGS in csv.reader --- Doc/whatsnew/3.12.rst | 2 +- Lib/test/test_csv.py | 22 +++++++++ ...-01-05-16-27-34.gh-issue-113732.fgDRXA.rst | 2 + Modules/_csv.c | 46 ++++++++++++------- 4 files changed, 54 insertions(+), 18 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-01-05-16-27-34.gh-issue-113732.fgDRXA.rst diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 9a2ccf7ebc6a686..0961bd0a573bd82 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -690,7 +690,7 @@ csv * Add :const:`csv.QUOTE_NOTNULL` and :const:`csv.QUOTE_STRINGS` flags to provide finer grained control of ``None`` and empty strings by - :class:`csv.writer` objects. + :class:`~csv.reader` and :class:`csv.writer` objects. dis --- diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 97b9bba24bcbca4..ce70bf36e82ddf1 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -347,10 +347,26 @@ def test_read_quoting(self): # will this fail where locale uses comma for decimals? self._read_test([',3,"5",7.3, 9'], [['', 3, '5', 7.3, 9]], quoting=csv.QUOTE_NONNUMERIC) + self._read_test([',3,"5",7.3, 9'], [[None, '3', '5', '7.3', ' 9']], + quoting=csv.QUOTE_NOTNULL) + self._read_test([',3,"5",7.3, 9'], [[None, 3, '5', 7.3, 9]], + quoting=csv.QUOTE_STRINGS) + + self._read_test([',,"",'], [['', '', '', '']]) + self._read_test([',,"",'], [['', '', '', '']], + quoting=csv.QUOTE_NONNUMERIC) + self._read_test([',,"",'], [[None, None, '', None]], + quoting=csv.QUOTE_NOTNULL) + self._read_test([',,"",'], [[None, None, '', None]], + quoting=csv.QUOTE_STRINGS) + self._read_test(['"a\nb", 7'], [['a\nb', ' 7']]) self.assertRaises(ValueError, self._read_test, ['abc,3'], [[]], quoting=csv.QUOTE_NONNUMERIC) + self.assertRaises(ValueError, self._read_test, + ['abc,3'], [[]], + quoting=csv.QUOTE_STRINGS) self._read_test(['1,@,3,@,5'], [['1', ',3,', '5']], quotechar='@') self._read_test(['1,\0,3,\0,5'], [['1', ',3,', '5']], quotechar='\0') @@ -358,6 +374,12 @@ def test_read_skipinitialspace(self): self._read_test(['no space, space, spaces,\ttab'], [['no space', 'space', 'spaces', '\ttab']], skipinitialspace=True) + self._read_test([' , , '], + [['', '', '']], + skipinitialspace=True) + self._read_test([' , , '], + [[None, None, None]], + skipinitialspace=True, quoting=csv.QUOTE_STRINGS) def test_read_bigfield(self): # This exercises the buffer realloc functionality and field size diff --git a/Misc/NEWS.d/next/Library/2024-01-05-16-27-34.gh-issue-113732.fgDRXA.rst b/Misc/NEWS.d/next/Library/2024-01-05-16-27-34.gh-issue-113732.fgDRXA.rst new file mode 100644 index 000000000000000..7582603dcf95f5f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-01-05-16-27-34.gh-issue-113732.fgDRXA.rst @@ -0,0 +1,2 @@ +Fix support of :data:`~csv.QUOTE_NOTNULL` and :data:`~csv.QUOTE_STRINGS` in +:func:`csv.reader`. diff --git a/Modules/_csv.c b/Modules/_csv.c index ae6b6457ffad9af..90860de2fadd133 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -133,7 +133,7 @@ typedef struct { Py_UCS4 *field; /* temporary buffer */ Py_ssize_t field_size; /* size of allocated buffer */ Py_ssize_t field_len; /* length of current field */ - int numeric_field; /* treat field as numeric */ + int unquoted_field; unsigned long line_num; /* Source-file line number */ } ReaderObj; @@ -607,22 +607,33 @@ _call_dialect(_csvstate *module_state, PyObject *dialect_inst, PyObject *kwargs) static int parse_save_field(ReaderObj *self) { + int quoting = self->dialect->quoting; PyObject *field; - field = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, - (void *) self->field, self->field_len); - if (field == NULL) - return -1; - self->field_len = 0; - if (self->numeric_field) { - PyObject *tmp; - - self->numeric_field = 0; - tmp = PyNumber_Float(field); - Py_DECREF(field); - if (tmp == NULL) + if (self->unquoted_field && + self->field_len == 0 && + (quoting == QUOTE_NOTNULL || quoting == QUOTE_STRINGS)) + { + field = Py_NewRef(Py_None); + } + else { + field = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + (void *) self->field, self->field_len); + if (field == NULL) return -1; - field = tmp; + if (self->unquoted_field && + self->field_len != 0 && + (quoting == QUOTE_NONNUMERIC || quoting == QUOTE_STRINGS)) + { + PyObject *tmp; + + tmp = PyNumber_Float(field); + Py_DECREF(field); + if (tmp == NULL) + return -1; + field = tmp; + } + self->field_len = 0; } if (PyList_Append(self->fields, field) < 0) { Py_DECREF(field); @@ -684,6 +695,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) /* fallthru */ case START_FIELD: /* expecting field */ + self->unquoted_field = 1; if (c == '\n' || c == '\r' || c == EOL) { /* save empty field - return [fields] */ if (parse_save_field(self) < 0) @@ -693,10 +705,12 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) else if (c == dialect->quotechar && dialect->quoting != QUOTE_NONE) { /* start quoted field */ + self->unquoted_field = 0; self->state = IN_QUOTED_FIELD; } else if (c == dialect->escapechar) { /* possible escaped character */ + self->unquoted_field = 0; self->state = ESCAPED_CHAR; } else if (c == ' ' && dialect->skipinitialspace) @@ -709,8 +723,6 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) } else { /* begin new unquoted field */ - if (dialect->quoting == QUOTE_NONNUMERIC) - self->numeric_field = 1; if (parse_add_char(self, module_state, c) < 0) return -1; self->state = IN_FIELD; @@ -854,7 +866,7 @@ parse_reset(ReaderObj *self) return -1; self->field_len = 0; self->state = START_RECORD; - self->numeric_field = 0; + self->unquoted_field = 0; return 0; } From c0d6557d0a54bbc73610fc8f21f29d345819d33a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 5 Jan 2024 19:41:59 +0200 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Erlend E. Aasland --- Modules/_csv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Modules/_csv.c b/Modules/_csv.c index 90860de2fadd133..fbc551a1093daf1 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -619,18 +619,18 @@ parse_save_field(ReaderObj *self) else { field = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, (void *) self->field, self->field_len); - if (field == NULL) + if (field == NULL) { return -1; + } if (self->unquoted_field && self->field_len != 0 && (quoting == QUOTE_NONNUMERIC || quoting == QUOTE_STRINGS)) { - PyObject *tmp; - - tmp = PyNumber_Float(field); + PyObject *tmp = PyNumber_Float(field); Py_DECREF(field); - if (tmp == NULL) + if (tmp == NULL) { return -1; + } field = tmp; } self->field_len = 0; From 11768e54e7879343015041ae0803cec0d0553a56 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 5 Jan 2024 20:09:25 +0200 Subject: [PATCH 3/5] Use bool. --- Modules/_csv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Modules/_csv.c b/Modules/_csv.c index fbc551a1093daf1..cb483331430cf8b 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -133,7 +133,7 @@ typedef struct { Py_UCS4 *field; /* temporary buffer */ Py_ssize_t field_size; /* size of allocated buffer */ Py_ssize_t field_len; /* length of current field */ - int unquoted_field; + bool unquoted_field; unsigned long line_num; /* Source-file line number */ } ReaderObj; @@ -695,7 +695,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) /* fallthru */ case START_FIELD: /* expecting field */ - self->unquoted_field = 1; + self->unquoted_field = true; if (c == '\n' || c == '\r' || c == EOL) { /* save empty field - return [fields] */ if (parse_save_field(self) < 0) @@ -705,12 +705,12 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) else if (c == dialect->quotechar && dialect->quoting != QUOTE_NONE) { /* start quoted field */ - self->unquoted_field = 0; + self->unquoted_field = false; self->state = IN_QUOTED_FIELD; } else if (c == dialect->escapechar) { /* possible escaped character */ - self->unquoted_field = 0; + self->unquoted_field = false; self->state = ESCAPED_CHAR; } else if (c == ' ' && dialect->skipinitialspace) @@ -866,7 +866,7 @@ parse_reset(ReaderObj *self) return -1; self->field_len = 0; self->state = START_RECORD; - self->unquoted_field = 0; + self->unquoted_field = false; return 0; } From 0dab238d64eb4ce8a46804758c01770801abe28f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 22 Jan 2024 15:50:12 +0200 Subject: [PATCH 4/5] Test QUOTE_NOTNULL with skipinitialspace. --- Lib/test/test_csv.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 5bf27a8bd3635c7..21a4cb586ff6658 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -422,6 +422,9 @@ def test_read_skipinitialspace(self): self._read_test([' , , '], [['', '', '']], skipinitialspace=True) + self._read_test([' , , '], + [[None, None, None]], + skipinitialspace=True, quoting=csv.QUOTE_NOTNULL) self._read_test([' , , '], [[None, None, None]], skipinitialspace=True, quoting=csv.QUOTE_STRINGS) From 31aaff5a178e1daa073ab9fb3ceeba2220bbe2aa Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 22 Jan 2024 18:27:13 +0200 Subject: [PATCH 5/5] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Doc/whatsnew/3.12.rst | 2 +- Modules/_csv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 5d215b095f2975e..100312a5940b796 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -690,7 +690,7 @@ csv * Add :const:`csv.QUOTE_NOTNULL` and :const:`csv.QUOTE_STRINGS` flags to provide finer grained control of ``None`` and empty strings by - :class:`~csv.reader` and :class:`csv.writer` objects. + :class:`~csv.reader` and :class:`~csv.writer` objects. dis --- diff --git a/Modules/_csv.c b/Modules/_csv.c index 4cd98b56cf804e1..3aa648b8e9cec44 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -131,7 +131,7 @@ typedef struct { Py_UCS4 *field; /* temporary buffer */ Py_ssize_t field_size; /* size of allocated buffer */ Py_ssize_t field_len; /* length of current field */ - bool unquoted_field; + bool unquoted_field; /* true if no quotes around the current field */ unsigned long line_num; /* Source-file line number */ } ReaderObj;