Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions Lib/test/test_dstring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import unittest


class DStringTestCase(unittest.TestCase):
def assertAllRaise(self, exception_type, regex, error_strings):
for str in error_strings:
with self.subTest(str=str):
with self.assertRaisesRegex(exception_type, regex) as cm:
eval(str)
# print("Testing expression:", repr(str))
# print(repr(cm.exception))
# print(repr(cm.exception.text))

def test_single_quote(self):
exprs = [
"d'hello'",
'D"hello"',
"d'hello\\nworld'",
]
self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)

def test_empty_dstring(self):
exprs = [
"d''''''",
'D""""""',
]
self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)

def test_no_last_newline(self):
exprs = [
"d'''\nhello world'''",
'D"""\nhello world"""',
"df'''\nhello {42}'''",
]
self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)

def test_simple_dstring(self):
self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n")
self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n")
self.assertEqual(eval('d"""\n hello world\n"""'), " hello world\n")
self.assertEqual(eval('d"""\n hello world\\\n """'), " hello world")
self.assertEqual(eval('dr"""\n hello world\\\n """'), " hello world\\\n")



# Allow running this test module directly: python test_dstring.py
if __name__ == '__main__':
    unittest.main()
4 changes: 2 additions & 2 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3420,7 +3420,7 @@ def determine_valid_prefixes():
# some uppercase-only prefix is added.
for letter in itertools.chain(string.ascii_lowercase, string.ascii_uppercase):
try:
eval(f'{letter}""')
eval(f'{letter}"""\n"""') # d-string needs multiline
single_char_valid_prefixes.add(letter.lower())
except SyntaxError:
pass
Expand All @@ -3444,7 +3444,7 @@ def determine_valid_prefixes():
# because it's a valid expression: not ""
continue
try:
eval(f'{p}""')
eval(f'{p}"""\n"""') # d-string needs multiline

# No syntax error, so p is a valid string
# prefix.
Expand Down
3 changes: 2 additions & 1 deletion Lib/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def _all_string_prefixes():
# The valid string prefixes. Only contain the lower case versions,
# and don't contain any permutations (include 'fr', but not
# 'rf'). The various permutations will be generated.
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr',
'df', 'dt', 'dr', 'dfr', 'dtr']
# if we add binary f-strings, add: ['fb', 'fbr']
result = {''}
for prefix in _valid_string_prefixes:
Expand Down
180 changes: 171 additions & 9 deletions Parser/action_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -1292,24 +1292,124 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq

// Fstring stuff

// Append one physical line [line_start, line_end) to the writer `w`.
//
// If the string is raw, or the line contains no backslash, the UTF-8
// bytes are written verbatim.  Otherwise escape sequences in the line
// are decoded first and the resulting str is appended.
// Returns 0 on success, -1 with an exception set on failure.
static int
unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
                         int is_raw, Token* token)
{
    if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
        return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
    }
    // BUGFIX: pass raw=0 so _PyPegen_decode_string actually processes the
    // escape sequences.  The previous code passed 1 here, which turned this
    // branch into a verbatim copy and left escapes such as the
    // backslash-newline line continuation undecoded in d-strings.
    PyObject *line = _PyPegen_decode_string(p, 0, line_start, line_end - line_start, token);
    if (line == NULL) {
        return -1;
    }
    int rc = PyUnicodeWriter_WriteStr(w, line);
    Py_DECREF(line);
    return rc < 0 ? -1 : 0;
}

// Strip `dedent_count` copies of `indent_char` from the start of every
// line of one d-string part and return the dedented text as a new str.
//
// `s`/`len` is the UTF-8 text of one Constant part of the (possibly
// formatted) d-string.  `is_first` marks the part that immediately
// follows the opening quotes; it must begin with a newline, which is
// dropped.  For non-first parts the leading partial line continues a
// line begun by an earlier part, so it is copied without dedenting.
// Lines consisting only of (a prefix of) the indentation followed by a
// newline are emitted as a bare '\n'; a trailing indentation-only line
// with no newline (the closing-quote line) is dropped.  Unless `is_raw`,
// escape sequences on each line are decoded.
// Returns a new str object, or NULL with an error set.
//
// NOTE(review): reading line_start[i] when line_start + i == s_end relies
// on the buffer being NUL-terminated — confirm `s` always comes from
// PyUnicode_AsUTF8.
static PyObject*
_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
                            int is_raw, int is_first, expr_ty constant, Token* token)
{
    // Line number of the current line, used for syntax-error locations.
    Py_ssize_t lineno = constant->lineno;
    const char *line_start = s;
    const char *s_end = s + len;

    PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
    if (w == NULL) {
        return NULL;
    }
    if (is_first) {
        assert (line_start[0] == '\n');
        line_start++; // skip the first newline
    }
    else {
        // Example: df"""
        //     first part {param} second part
        //     next line
        //     """
        // We don't need to dedent the first line in the non-first parts:
        // it continues a line that has already been dedented.
        const char *line_end = memchr(line_start, '\n', s_end - line_start);
        if (line_end) {
            line_end++; // include the newline
        }
        else {
            line_end = s_end;
        }
        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
            PyUnicodeWriter_Discard(w);
            return NULL;
        }
        line_start = line_end;
    }

    // Process the remaining lines one at a time.
    while (line_start < s + len) {
        lineno++;

        // Count how many of the expected indent characters this line has
        // (at most dedent_count).
        Py_ssize_t i = 0;
        while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
            i++;
        }

        if (line_start[i] == '\0') { // found an empty line without newline.
            // This is the indentation-only closing-quote line: drop it.
            break;
        }
        if (line_start[i] == '\n') { // found an empty line with newline.
            // Whitespace-only lines need not carry the full indentation;
            // they collapse to a bare newline.
            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
                PyUnicodeWriter_Discard(w);
                return NULL;
            }
            line_start += i+1;
            continue;
        }
        if (i < dedent_count) { // found an invalid indent.
            // Non-empty line with less indentation than the closing line,
            // or a mismatched indent character.
            assert(line_start[i] != indent_char);
            PyUnicodeWriter_Discard(w);
            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
                                       "d-string line missing valid indentation");
            return NULL;
        }

        // found an indented line. let's dedent it.
        line_start += i;
        const char *line_end = memchr(line_start, '\n', s_end - line_start);
        if (line_end) {
            line_end++; // include the newline
        }
        else {
            line_end = s_end;
        }
        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
            PyUnicodeWriter_Discard(w);
            return NULL;
        }
        line_start = line_end;
    }
    return PyUnicodeWriter_Finish(w);
}

static expr_ty
_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) {
_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
assert(PyUnicode_CheckExact(constant->v.Constant.value));

const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
if (bstr == NULL) {
return NULL;
}
is_raw = is_raw || strchr(bstr, '\\') == NULL;

size_t len;
if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
len = 1;
} else {
len = strlen(bstr);
PyObject *str = NULL;
if (dedent_count > 0) {
str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
is_raw, is_first, constant, token);
}
else {
str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
}

is_raw = is_raw || strchr(bstr, '\\') == NULL;
PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token);
if (str == NULL) {
_Pypegen_raise_decode_error(p);
return NULL;
Expand Down Expand Up @@ -1340,12 +1440,74 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
return NULL;
}
int is_raw = strpbrk(quote_str, "rR") != NULL;
int is_dedent = strpbrk(quote_str, "dD") != NULL;
int indent_char = 0;
Py_ssize_t indent_count = 0;

asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
if (seq == NULL) {
return NULL;
}

if (is_dedent) {
expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
if (first_item->kind != Constant_kind
|| PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
first_item,
"d-string must start with a newline"
);
return NULL;
}

expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
if (last_item->kind != Constant_kind) {
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
last_item,
"d-string must end with an indent line"
);
return NULL;
}

Py_ssize_t blen;
const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
if (bstr == NULL) {
return NULL;
}

// memrchr is GNU extension; use manual loop for portability.
const char *lastline = bstr + blen;
while (bstr < lastline) {
if (lastline[-1] == '\n') {
break;
}
lastline--;
if (*lastline != ' ' && *lastline != '\t') {
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
last_item,
"d-string must end with an indent line"
);
return NULL;
}
}

// checks indent of the last line.
indent_count = bstr + blen - lastline;
if (indent_count > 0) {
indent_char = lastline[0];

for (Py_ssize_t i = 1; i < indent_count; i++) {
if (lastline[i] != indent_char) {
RAISE_ERROR_KNOWN_LOCATION(
p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
"inconsistent use of tabs and spaces in indentation"
);
return NULL;
}
}
}
}

Py_ssize_t index = 0;
for (Py_ssize_t i = 0; i < n_items; i++) {
expr_ty item = asdl_seq_GET(raw_expressions, i);
Expand Down Expand Up @@ -1377,7 +1539,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
}

if (item->kind == Constant_kind) {
item = _PyPegen_decode_fstring_part(p, is_raw, item, b);
item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
if (item == NULL) {
return NULL;
}
Expand Down
Loading
Loading