Coverage for coverage / phystokens.py: 100.000%
96 statements
coverage.py v7.12.1a0.dev1, created at 2025-11-30 17:57 +0000

# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/coveragepy/coveragepy/blob/main/NOTICE.txt

"""Better tokenizing for coverage.py."""

from __future__ import annotations

import ast
import io
import keyword
import re
import sys
import token
import tokenize
from collections.abc import Iterable

from coverage import env
from coverage.types import TLineNo, TSourceTokenLines

TokenInfos = Iterable[tokenize.TokenInfo]


def _phys_tokens(toks: TokenInfos) -> TokenInfos:
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line: str | None = None
    last_lineno = -1
    last_ttext: str = ""
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if (  # pylint: disable=simplifiable-if-statement
                        last_line.endswith("\\\n")
                        and last_line.rstrip(" \\\n").endswith(last_ttext)
                    ):
                        # Deal with special cases like this code (there may be
                        # zero or more blanks between "," and "\")::
                        #
                        #   a = ["aaa",\
                        #       "bbb \
                        #       ccc"]
                        #
                        inject_backslash = True
                    else:
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                elif env.PYBEHAVIOR.fstring_syntax and ttype == token.FSTRING_MIDDLE:
                    inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield tokenize.TokenInfo(
                        99999,
                        "\\\n",
                        (slineno, ccol),
                        (slineno, ccol + 2),
                        last_line,
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield tokenize.TokenInfo(ttype, ttext, (slineno, scol), (elineno, ecol), ltext)
        last_lineno = elineno
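
To see the difference, feed a backslash continuation through both the raw
tokenizer and the wrapper. A minimal sketch (demo code, not part of this
module; the local names are made up)::

    import io
    import tokenize

    demo = "x = 1 + \\\n    2\n"
    raw = list(tokenize.generate_tokens(io.StringIO(demo).readline))
    phys = list(_phys_tokens(iter(raw)))

    # The wrapper yields exactly one extra token: the fake type 99999 with
    # text "\\\n", standing in for the backslash that raw tokenize drops.
    extra = [t for t in phys if t.type == 99999]
    assert len(extra) == 1 and extra[0].string == "\\\n"
    assert len(phys) == len(raw) + 1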


def find_soft_key_lines(source: str) -> set[TLineNo]:
    """Helper for finding lines with soft keywords, like match/case lines."""
    soft_key_lines: set[TLineNo] = set()

    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.Match):
            soft_key_lines.add(node.lineno)
            for case in node.cases:
                soft_key_lines.add(case.pattern.lineno)
        elif sys.version_info >= (3, 12) and isinstance(node, ast.TypeAlias):
            soft_key_lines.add(node.lineno)

    return soft_key_lines
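
For example, a small match statement yields the lines holding "match" and
each "case" pattern. A sketch (assumes Python 3.10+ so ast can parse match;
textwrap is only used to keep the sample source readable)::

    import textwrap

    sample = textwrap.dedent("""\
        match command:
            case "go":
                move()
            case _:
                wait()
        """)
    # "match" is on line 1, the two "case" patterns on lines 2 and 4.
    assert find_soft_key_lines(sample) == {1, 2, 4}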


def source_token_lines(source: str) -> TSourceTokenLines:
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing white space is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = {token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL}
    line: list[tuple[str, str]] = []
    col = 0

    source = source.expandtabs(8).replace("\r\n", "\n")
    tokgen = generate_tokens(source)

    soft_key_lines = find_soft_key_lines(source)

    for ttype, ttext, (sline, scol), (_, ecol), _ in _phys_tokens(tokgen):
        mark_start = True
        for part in re.split("(\n)", ttext):
            if part == "\n":
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == "":
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if env.PYBEHAVIOR.fstring_syntax and ttype == token.FSTRING_MIDDLE:
                    part = part.replace("{", "{{").replace("}", "}}")
                    ecol = scol + len(part)
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, "xx").lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif keyword.issoftkeyword(ttext):
                        # Soft keywords appear at the start of their line.
                        if len(line) == 0:
                            is_start_of_line = True
                        elif (len(line) == 1) and line[0][0] == "ws":
                            is_start_of_line = True
                        else:
                            is_start_of_line = False
                        if is_start_of_line and sline in soft_key_lines:
                            tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
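
A quick round-trip of the docstring's contract, as a sketch (demo code, not
part of the module)::

    src = "def hello():\n    return 42\n"
    token_lines = list(source_token_lines(src))
    # token_lines[0] is [("key", "def"), ("ws", " "), ("nam", "hello"),
    #                    ("op", "("), ("op", ")"), ("op", ":")]
    rebuilt = "\n".join(
        "".join(text for _, text in tline) for tline in token_lines
    )
    assert rebuilt == src.rstrip("\n")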


def generate_tokens(text: str) -> TokenInfos:
    """A helper around `tokenize.generate_tokens`.

    Originally this was used to cache the results, but it didn't seem to make
    reporting go faster, and caused issues with using too much memory.

    """
    readline = io.StringIO(text).readline
    return tokenize.generate_tokens(readline)
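
Usage is the obvious one-liner over a source string (a sketch)::

    for tok in generate_tokens("a = 1\n"):
        print(tok.type, tok.string)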


def source_encoding(source: bytes) -> str:
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]
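
For instance, a PEP 263 coding declaration is honored, and its absence falls
back to tokenize's UTF-8 default (a sketch; the normalized encoding names
come from tokenize.detect_encoding)::

    assert source_encoding(b"# -*- coding: latin-1 -*-\nx = 1\n") == "iso-8859-1"
    assert source_encoding(b"x = 1\n") == "utf-8"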