diff --git a/Lib/_colorize.py b/Lib/_colorize.py
index 0b7047620b4556..81bcda24122095 100644
--- a/Lib/_colorize.py
+++ b/Lib/_colorize.py
@@ -1,8 +1,16 @@
+import builtins
+import keyword
 import os
 import sys
+import token as T
+import tokenize
+from collections import deque
 from collections.abc import Callable, Iterator, Mapping
 from dataclasses import dataclass, field, Field
+from io import StringIO
+from tokenize import TokenInfo as TI
+from typing import Iterable, Match, NamedTuple, Self
 
 COLORIZE = True
@@ -373,3 +381,238 @@ def set_theme(t: Theme) -> None:
 
 set_theme(default_theme)
+
+
+# --------------------------- Syntax colorizer ------------------------------- #
+
+IDENTIFIERS_AFTER = {"def", "class"}
+KEYWORD_CONSTANTS = {"True", "False", "None"}
+BUILTINS = {str(name) for name in dir(builtins) if not name.startswith('_')}
+_keyword_first_sets_match = {"False", "None", "True", "await", "lambda", "not"}
+_keyword_first_sets_case = {"False", "None", "True"}
+
+
+class _Span(NamedTuple):
+    """Span indexing that's inclusive on both ends."""
+
+    start: int
+    end: int
+
+    @classmethod
+    def from_re(cls, m: Match[str], group: int | str) -> Self:
+        re_span = m.span(group)
+        return cls(re_span[0], re_span[1] - 1)
+
+    @classmethod
+    def from_token(cls, token: TI, line_len: list[int]) -> Self:
+        end_offset = -1
+        if (token.type in {T.FSTRING_MIDDLE, T.TSTRING_MIDDLE}
+            and token.string.endswith(("{", "}"))):
+            # gh-134158: a visible trailing brace comes from a double brace in input
+            end_offset += 1
+
+        return cls(
+            line_len[token.start[0] - 1] + token.start[1],
+            line_len[token.end[0] - 1] + token.end[1] + end_offset,
+        )
+
+
+class _ColorSpan(NamedTuple):
+    span: _Span
+    tag: str
+
+
+def _prev_next_window[T](
+    iterable: Iterable[T]
+) -> Iterator[tuple[T | None, ...]]:
+    """Generates three-tuples of (previous, current, next) items.
+
+    On the first iteration previous is None. On the last iteration next
+    is None. In case of exception next is None and the exception is re-raised
+    on a subsequent next() call.
+
+    Inspired by `sliding_window` from `itertools` recipes.
+    """
+
+    iterator = iter(iterable)
+    window = deque((None, next(iterator)), maxlen=3)
+    try:
+        for x in iterator:
+            window.append(x)
+            yield tuple(window)
+    except Exception:
+        raise
+    finally:
+        window.append(None)
+        yield tuple(window)
+
+
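The windowing helper is easiest to understand from its output. A minimal sketch, assuming this patch is applied (plain integers stand in for TokenInfo objects; the helper is agnostic about element type):

    from _colorize import _prev_next_window

    assert list(_prev_next_window([1, 2, 3])) == [
        (None, 1, 2),  # first item has no previous
        (1, 2, 3),
        (2, 3, None),  # last item has no next
    ]
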
+def _is_soft_keyword_used(*tokens: TI | None) -> bool:
+    """Returns True if the current token is a keyword in this context.
+
+    For the `*tokens` to match anything, they have to be a three-tuple of
+    (previous, current, next).
+    """
+    match tokens:
+        case (
+            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
+            TI(string="match"),
+            TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
+            | TI(T.OP, string="(" | "*" | "[" | "{" | "~" | "...")
+        ):
+            return True
+        case (
+            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
+            TI(string="match"),
+            TI(T.NAME, string=s)
+        ):
+            if keyword.iskeyword(s):
+                return s in _keyword_first_sets_match
+            return True
+        case (
+            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
+            TI(string="case"),
+            TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
+            | TI(T.OP, string="(" | "*" | "-" | "[" | "{")
+        ):
+            return True
+        case (
+            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
+            TI(string="case"),
+            TI(T.NAME, string=s)
+        ):
+            if keyword.iskeyword(s):
+                return s in _keyword_first_sets_case
+            return True
+        case (TI(string="case"), TI(string="_"), TI(string=":")):
+            return True
+        case (
+            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
+            TI(string="type"),
+            TI(T.NAME, string=s)
+        ):
+            return not keyword.iskeyword(s)
+        case _:
+            return False
+
+
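This context check is what lets the colorizer highlight `match` and `case` only when they actually start a statement. A minimal sketch of the end-to-end effect, assuming this patch is applied:

    import _colorize

    code = "match command:\n    case 'go':\n        pass\n"
    tags = {code[c.span.start:c.span.end + 1]: c.tag
            for c in _colorize._gen_colors(code)}
    # Here "match" and "case" start their statements, so both come back as
    # "soft_keyword" and "pass" as "keyword".  In "obj.match(x)" the previous
    # token is a dot rather than a statement boundary, so "match" gets no tag.
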
+def _gen_colors_from_token_stream(
+    token_generator: Iterator[TI],
+    line_lengths: list[int],
+) -> Iterator[_ColorSpan]:
+    token_window = _prev_next_window(token_generator)
+
+    is_def_name = False
+    bracket_level = 0
+    for prev_token, token, next_token in token_window:
+        assert token is not None
+        if token.start == token.end:
+            continue
+
+        match token.type:
+            case (
+                T.STRING
+                | T.FSTRING_START | T.FSTRING_MIDDLE | T.FSTRING_END
+                | T.TSTRING_START | T.TSTRING_MIDDLE | T.TSTRING_END
+            ):
+                span = _Span.from_token(token, line_lengths)
+                yield _ColorSpan(span, "string")
+            case T.COMMENT:
+                span = _Span.from_token(token, line_lengths)
+                yield _ColorSpan(span, "comment")
+            case T.NUMBER:
+                span = _Span.from_token(token, line_lengths)
+                yield _ColorSpan(span, "number")
+            case T.OP:
+                if token.string in "([{":
+                    bracket_level += 1
+                elif token.string in ")]}":
+                    bracket_level -= 1
+                span = _Span.from_token(token, line_lengths)
+                yield _ColorSpan(span, "op")
+            case T.NAME:
+                if is_def_name:
+                    is_def_name = False
+                    span = _Span.from_token(token, line_lengths)
+                    yield _ColorSpan(span, "definition")
+                elif keyword.iskeyword(token.string):
+                    span_cls = "keyword"
+                    if token.string in KEYWORD_CONSTANTS:
+                        span_cls = "keyword_constant"
+                    span = _Span.from_token(token, line_lengths)
+                    yield _ColorSpan(span, span_cls)
+                    if token.string in IDENTIFIERS_AFTER:
+                        is_def_name = True
+                elif (
+                    keyword.issoftkeyword(token.string)
+                    and bracket_level == 0
+                    and _is_soft_keyword_used(prev_token, token, next_token)
+                ):
+                    span = _Span.from_token(token, line_lengths)
+                    yield _ColorSpan(span, "soft_keyword")
+                elif (
+                    token.string in BUILTINS
+                    and not (prev_token and prev_token.exact_type == T.DOT)
+                ):
+                    span = _Span.from_token(token, line_lengths)
+                    yield _ColorSpan(span, "builtin")
+
+
+def _recover_unterminated_string(
+    exc: tokenize.TokenError,
+    line_lengths: list[int],
+    last_emitted: _ColorSpan | None,
+    buffer: str,
+) -> Iterator[_ColorSpan]:
+    msg, loc = exc.args
+    if loc is None:
+        return
+
+    line_no, column = loc
+
+    if msg.startswith(
+        (
+            "unterminated string literal",
+            "unterminated f-string literal",
+            "unterminated t-string literal",
+            "EOF in multi-line string",
+            "unterminated triple-quoted f-string literal",
+            "unterminated triple-quoted t-string literal",
+        )
+    ):
+        start = line_lengths[line_no - 1] + column - 1
+        end = line_lengths[-1] - 1
+
+        # in case FSTRING_START was already emitted
+        if last_emitted and start <= last_emitted.span.start:
+            start = last_emitted.span.end + 1
+
+        span = _Span(start, end)
+        yield _ColorSpan(span, "string")
+
+
+def _gen_colors(buffer: str) -> Iterator[_ColorSpan]:
+    """Returns a list of index spans to color using the given color tag.
+
+    The input `buffer` should be a valid start of a Python code block, i.e.
+    it cannot be a block starting in the middle of a multiline string.
+    """
+    sio = StringIO(buffer)
+    line_lengths = [0] + [len(line) for line in sio.readlines()]
+    # make line_lengths cumulative
+    for i in range(1, len(line_lengths)):
+        line_lengths[i] += line_lengths[i-1]
+
+    sio.seek(0)
+    gen = tokenize.generate_tokens(sio.readline)
+    last_emitted: _ColorSpan | None = None
+    try:
+        for color in _gen_colors_from_token_stream(gen, line_lengths):
+            yield color
+            last_emitted = color
+    except SyntaxError:
+        return
+    except tokenize.TokenError as te:
+        yield from _recover_unterminated_string(
+            te, line_lengths, last_emitted, buffer
+        )
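One subtlety worth illustrating is the TokenError recovery path: an unterminated (for example triple-quoted) string makes tokenize give up partway, and _recover_unterminated_string back-fills a single "string" span to the end of the buffer. A minimal sketch, assuming this patch is applied:

    import _colorize

    code = 'greeting = """hello\nworld'
    spans = list(_colorize._gen_colors(code))
    # tokenize raises TokenError("EOF in multi-line string", ...); the last
    # span is synthesized by _recover_unterminated_string and covers the open
    # literal through the end of the buffer.
    assert spans[-1].tag == "string"
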
diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py
index 0ebd9162eca4bb..1cda9853ebe7f0 100644
--- a/Lib/_pyrepl/reader.py
+++ b/Lib/_pyrepl/reader.py
@@ -28,7 +28,7 @@
 from dataclasses import dataclass, field, fields
 
 from . import commands, console, input
-from .utils import wlen, unbracket, disp_str, gen_colors, THEME
+from .utils import wlen, unbracket, disp_str, THEME
 from .trace import trace
 
 
@@ -312,7 +312,7 @@ def calc_screen(self) -> list[str]:
         prompt_from_cache = (offset and self.buffer[offset - 1] != "\n")
 
         if self.can_colorize:
-            colors = list(gen_colors(self.get_unicode()))
+            colors = list(_colorize._gen_colors(self.get_unicode()))
         else:
             colors = None
         trace("colors = {colors}", colors=colors)
diff --git a/Lib/_pyrepl/utils.py b/Lib/_pyrepl/utils.py
index 06cddef851bb40..f16275ea7dcdeb 100644
--- a/Lib/_pyrepl/utils.py
+++ b/Lib/_pyrepl/utils.py
@@ -1,27 +1,22 @@
 from __future__ import annotations
-import builtins
 import functools
-import keyword
 import re
-import token as T
-import tokenize
 import unicodedata
 import _colorize
 
 from collections import deque
-from io import StringIO
-from tokenize import TokenInfo as TI
-from typing import Iterable, Iterator, Match, NamedTuple, Self
+from typing import Iterable, Iterator
 
 from .types import CharBuffer, CharWidths
-from .trace import trace
 
 ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
 ZERO_WIDTH_BRACKET = re.compile(r"\x01.*?\x02")
 ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})
-IDENTIFIERS_AFTER = {"def", "class"}
-KEYWORD_CONSTANTS = {"True", "False", "None"}
-BUILTINS = {str(name) for name in dir(builtins) if not name.startswith('_')}
+
+# Re-export from _colorize for backward compatibility
+gen_colors = _colorize._gen_colors
+ColorSpan = _colorize._ColorSpan
+Span = _colorize._Span
 
 
 def THEME(**kwargs):
@@ -29,36 +24,6 @@ def THEME(**kwargs):
     return _colorize.get_theme(**kwargs).syntax
 
 
-class Span(NamedTuple):
-    """Span indexing that's inclusive on both ends."""
-
-    start: int
-    end: int
-
-    @classmethod
-    def from_re(cls, m: Match[str], group: int | str) -> Self:
-        re_span = m.span(group)
-        return cls(re_span[0], re_span[1] - 1)
-
-    @classmethod
-    def from_token(cls, token: TI, line_len: list[int]) -> Self:
-        end_offset = -1
-        if (token.type in {T.FSTRING_MIDDLE, T.TSTRING_MIDDLE}
-            and token.string.endswith(("{", "}"))):
-            # gh-134158: a visible trailing brace comes from a double brace in input
-            end_offset += 1
-
-        return cls(
-            line_len[token.start[0] - 1] + token.start[1],
-            line_len[token.end[0] - 1] + token.end[1] + end_offset,
-        )
-
-
-class ColorSpan(NamedTuple):
-    span: Span
-    tag: str
-
-
 @functools.cache
 def str_width(c: str) -> int:
     if ord(c) < 128:
@@ -96,190 +61,6 @@ def unbracket(s: str, including_content: bool = False) -> str:
     return s.translate(ZERO_WIDTH_TRANS)
 
 
-def gen_colors(buffer: str) -> Iterator[ColorSpan]:
-    """Returns a list of index spans to color using the given color tag.
-
-    The input `buffer` should be a valid start of a Python code block, i.e.
-    it cannot be a block starting in the middle of a multiline string.
-    """
-    sio = StringIO(buffer)
-    line_lengths = [0] + [len(line) for line in sio.readlines()]
-    # make line_lengths cumulative
-    for i in range(1, len(line_lengths)):
-        line_lengths[i] += line_lengths[i-1]
-
-    sio.seek(0)
-    gen = tokenize.generate_tokens(sio.readline)
-    last_emitted: ColorSpan | None = None
-    try:
-        for color in gen_colors_from_token_stream(gen, line_lengths):
-            yield color
-            last_emitted = color
-    except SyntaxError:
-        return
-    except tokenize.TokenError as te:
-        yield from recover_unterminated_string(
-            te, line_lengths, last_emitted, buffer
-        )
-
-
-def recover_unterminated_string(
-    exc: tokenize.TokenError,
-    line_lengths: list[int],
-    last_emitted: ColorSpan | None,
-    buffer: str,
-) -> Iterator[ColorSpan]:
-    msg, loc = exc.args
-    if loc is None:
-        return
-
-    line_no, column = loc
-
-    if msg.startswith(
-        (
-            "unterminated string literal",
-            "unterminated f-string literal",
-            "unterminated t-string literal",
-            "EOF in multi-line string",
-            "unterminated triple-quoted f-string literal",
-            "unterminated triple-quoted t-string literal",
-        )
-    ):
-        start = line_lengths[line_no - 1] + column - 1
-        end = line_lengths[-1] - 1
-
-        # in case FSTRING_START was already emitted
-        if last_emitted and start <= last_emitted.span.start:
-            trace("before last emitted = {s}", s=start)
-            start = last_emitted.span.end + 1
-
-        span = Span(start, end)
-        trace("yielding span {a} -> {b}", a=span.start, b=span.end)
-        yield ColorSpan(span, "string")
-    else:
-        trace(
-            "unhandled token error({buffer}) = {te}",
-            buffer=repr(buffer),
-            te=str(exc),
-        )
-
-
-def gen_colors_from_token_stream(
-    token_generator: Iterator[TI],
-    line_lengths: list[int],
-) -> Iterator[ColorSpan]:
-    token_window = prev_next_window(token_generator)
-
-    is_def_name = False
-    bracket_level = 0
-    for prev_token, token, next_token in token_window:
-        assert token is not None
-        if token.start == token.end:
-            continue
-
-        match token.type:
-            case (
-                T.STRING
-                | T.FSTRING_START | T.FSTRING_MIDDLE | T.FSTRING_END
-                | T.TSTRING_START | T.TSTRING_MIDDLE | T.TSTRING_END
-            ):
-                span = Span.from_token(token, line_lengths)
-                yield ColorSpan(span, "string")
-            case T.COMMENT:
-                span = Span.from_token(token, line_lengths)
-                yield ColorSpan(span, "comment")
-            case T.NUMBER:
-                span = Span.from_token(token, line_lengths)
-                yield ColorSpan(span, "number")
-            case T.OP:
-                if token.string in "([{":
-                    bracket_level += 1
-                elif token.string in ")]}":
-                    bracket_level -= 1
-                span = Span.from_token(token, line_lengths)
-                yield ColorSpan(span, "op")
-            case T.NAME:
-                if is_def_name:
-                    is_def_name = False
-                    span = Span.from_token(token, line_lengths)
-                    yield ColorSpan(span, "definition")
-                elif keyword.iskeyword(token.string):
-                    span_cls = "keyword"
-                    if token.string in KEYWORD_CONSTANTS:
-                        span_cls = "keyword_constant"
-                    span = Span.from_token(token, line_lengths)
-                    yield ColorSpan(span, span_cls)
-                    if token.string in IDENTIFIERS_AFTER:
-                        is_def_name = True
-                elif (
-                    keyword.issoftkeyword(token.string)
-                    and bracket_level == 0
-                    and is_soft_keyword_used(prev_token, token, next_token)
-                ):
-                    span = Span.from_token(token, line_lengths)
-                    yield ColorSpan(span, "soft_keyword")
-                elif (
-                    token.string in BUILTINS
-                    and not (prev_token and prev_token.exact_type == T.DOT)
-                ):
-                    span = Span.from_token(token, line_lengths)
-                    yield ColorSpan(span, "builtin")
-
-
-keyword_first_sets_match = {"False", "None", "True", "await", "lambda", "not"}
-keyword_first_sets_case = {"False", "None", "True"}
-
-
-def is_soft_keyword_used(*tokens: TI | None) -> bool:
-    """Returns True if the current token is a keyword in this context.
-
-    For the `*tokens` to match anything, they have to be a three-tuple of
-    (previous, current, next).
-    """
-    trace("is_soft_keyword_used{t}", t=tokens)
-    match tokens:
-        case (
-            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
-            TI(string="match"),
-            TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
-            | TI(T.OP, string="(" | "*" | "[" | "{" | "~" | "...")
-        ):
-            return True
-        case (
-            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
-            TI(string="match"),
-            TI(T.NAME, string=s)
-        ):
-            if keyword.iskeyword(s):
-                return s in keyword_first_sets_match
-            return True
-        case (
-            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
-            TI(string="case"),
-            TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
-            | TI(T.OP, string="(" | "*" | "-" | "[" | "{")
-        ):
-            return True
-        case (
-            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
-            TI(string="case"),
-            TI(T.NAME, string=s)
-        ):
-            if keyword.iskeyword(s):
-                return s in keyword_first_sets_case
-            return True
-        case (TI(string="case"), TI(string="_"), TI(string=":")):
-            return True
-        case (
-            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
-            TI(string="type"),
-            TI(T.NAME, string=s)
-        ):
-            return not keyword.iskeyword(s)
-        case _:
-            return False
-
-
 def disp_str(
     buffer: str,
     colors: list[ColorSpan] | None = None,
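The module-level aliases above keep the old public names importable, so existing callers of `_pyrepl.utils` do not need to change. For example:

    import _colorize
    from _pyrepl.utils import gen_colors, ColorSpan, Span

    assert gen_colors is _colorize._gen_colors
    assert ColorSpan is _colorize._ColorSpan
    assert Span is _colorize._Span
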
diff --git a/Lib/pdb.py b/Lib/pdb.py
index c1a5db080dc7ef..e1067eba110760 100644
--- a/Lib/pdb.py
+++ b/Lib/pdb.py
@@ -1093,7 +1093,7 @@ def handle_command_def(self, line):
 
     def _colorize_code(self, code):
         if self.colorize:
-            colors = list(_pyrepl.utils.gen_colors(code))
+            colors = list(_colorize._gen_colors(code))
             chars, _ = _pyrepl.utils.disp_str(code, colors=colors, force_color=True)
             code = "".join(chars)
         return code
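The pdb hunk above is the whole rendering pipeline in miniature: compute spans with `_colorize._gen_colors`, then let `_pyrepl.utils.disp_str` interleave the theme's ANSI escapes with the characters. A standalone sketch of the same flow, assuming this patch is applied:

    import _colorize
    import _pyrepl.utils

    def render(code: str) -> str:
        # Same steps as Pdb._colorize_code above.
        colors = list(_colorize._gen_colors(code))
        chars, _ = _pyrepl.utils.disp_str(code, colors=colors, force_color=True)
        return "".join(chars)

    print(render("def greet(name):\n    return f'hi {name}'\n"))
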
diff --git a/Lib/test/test__colorize.py b/Lib/test/test__colorize.py
index 67e0595943d356..b0b6cb5549992c 100644
--- a/Lib/test/test__colorize.py
+++ b/Lib/test/test__colorize.py
@@ -178,5 +178,29 @@ def test_colorized_detection_checks_for_file(self):
             self.assertEqual(_colorize.can_colorize(file=file), False)
 
 
+class TestSyntaxColorizer(unittest.TestCase):
+    def test_gen_colors_keyword_highlighting(self):
+        cases = [
+            # no highlights
+            ("a.set", [(".", "op")]),
+            ("obj.list", [(".", "op")]),
+            ("obj.match", [(".", "op")]),
+            ("b. \\\n format", [(".", "op")]),
+            # highlights
+            ("set", [("set", "builtin")]),
+            ("list", [("list", "builtin")]),
+            (" \n dict", [("dict", "builtin")]),
+        ]
+        for code, expected_highlights in cases:
+            with self.subTest(code=code):
+                colors = list(_colorize._gen_colors(code))
+                # Extract (text, tag) pairs for comparison
+                actual_highlights = []
+                for color in colors:
+                    span_text = code[color.span.start:color.span.end + 1]
+                    actual_highlights.append((span_text, color.tag))
+                self.assertEqual(actual_highlights, expected_highlights)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/test/test_pyrepl/test_utils.py b/Lib/test/test_pyrepl/test_utils.py
index 656a1e441e0e47..c458ffd9f6b0c1 100644
--- a/Lib/test/test_pyrepl/test_utils.py
+++ b/Lib/test/test_pyrepl/test_utils.py
@@ -1,6 +1,6 @@
 from unittest import TestCase
 
-from _pyrepl.utils import str_width, wlen, prev_next_window, gen_colors
+from _pyrepl.utils import str_width, wlen, prev_next_window
 
 
 class TestUtils(TestCase):
@@ -81,25 +81,3 @@ def gen_raise():
         self.assertEqual(next(pnw), (3, 4, None))
         with self.assertRaises(ZeroDivisionError):
             next(pnw)
-
-    def test_gen_colors_keyword_highlighting(self):
-        cases = [
-            # no highlights
-            ("a.set", [(".", "op")]),
-            ("obj.list", [(".", "op")]),
-            ("obj.match", [(".", "op")]),
-            ("b. \\\n format", [(".", "op")]),
-            # highlights
-            ("set", [("set", "builtin")]),
-            ("list", [("list", "builtin")]),
-            (" \n dict", [("dict", "builtin")]),
-        ]
-        for code, expected_highlights in cases:
-            with self.subTest(code=code):
-                colors = list(gen_colors(code))
-                # Extract (text, tag) pairs for comparison
-                actual_highlights = []
-                for color in colors:
-                    span_text = code[color.span.start:color.span.end + 1]
-                    actual_highlights.append((span_text, color.tag))
-                self.assertEqual(actual_highlights, expected_highlights)
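The relocated test also documents the span convention: spans are inclusive on both ends, so the highlighted text is recovered by slicing with `end + 1`. The first test case, restated outside unittest as a quick sketch assuming this patch is applied:

    import _colorize

    code = "a.set"
    [color] = _colorize._gen_colors(code)
    # Only the dot is highlighted: "set" follows a DOT token, so the builtin
    # rule skips it, and the single-character span has start == end.
    assert (code[color.span.start:color.span.end + 1], color.tag) == (".", "op")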