Coverage for coverage/phystokens.py: 100.000%

96 statements  

coverage.py v7.12.1a0.dev1, created at 2025-11-30 17:57 +0000

# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/coveragepy/coveragepy/blob/main/NOTICE.txt

"""Better tokenizing for coverage.py."""

from __future__ import annotations

import ast
import io
import keyword
import re
import sys
import token
import tokenize
from collections.abc import Iterable

from coverage import env
from coverage.types import TLineNo, TSourceTokenLines

TokenInfos = Iterable[tokenize.TokenInfo]


def _phys_tokens(toks: TokenInfos) -> TokenInfos:
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line: str | None = None
    last_lineno = -1
    last_ttext: str = ""
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if (  # pylint: disable=simplifiable-if-statement
                        last_line.endswith("\\\n")
                        and last_line.rstrip(" \\\n").endswith(last_ttext)
                    ):
                        # Deal with special cases like such code::
                        #
                        #   a = ["aaa",\  # there may be zero or more blanks between "," and "\".
                        #        "bbb \
                        #        ccc"]
                        #
                        inject_backslash = True
                    else:
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                elif env.PYBEHAVIOR.fstring_syntax and ttype == token.FSTRING_MIDDLE:
                    inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield tokenize.TokenInfo(
                        99999,
                        "\\\n",
                        (slineno, ccol),
                        (slineno, ccol + 2),
                        last_line,
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield tokenize.TokenInfo(ttype, ttext, (slineno, scol), (elineno, ecol), ltext)
        last_lineno = elineno
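

# An illustrative sketch (not part of the module): for a backslash-continued
# line, the wrapper injects one extra token with the fake type 99999, which
# tokenize itself never reports:
#
#     toks = generate_tokens("x = 1 + \\\n    2\n")
#     [t.string for t in _phys_tokens(toks) if t.type == 99999]
#     # -> ['\\\n']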


def find_soft_key_lines(source: str) -> set[TLineNo]:
    """Helper for finding lines with soft keywords, like match/case lines."""
    soft_key_lines: set[TLineNo] = set()

    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.Match):
            soft_key_lines.add(node.lineno)
            for case in node.cases:
                soft_key_lines.add(case.pattern.lineno)
        elif sys.version_info >= (3, 12) and isinstance(node, ast.TypeAlias):
            soft_key_lines.add(node.lineno)

    return soft_key_lines
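

# For example (an illustrative sketch): both the match line and the case
# pattern line are reported:
#
#     find_soft_key_lines("match p:\n    case 1:\n        pass\n")
#     # -> {1, 2}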


def source_token_lines(source: str) -> TSourceTokenLines:
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing white space is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = {token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL}
    line: list[tuple[str, str]] = []
    col = 0

    source = source.expandtabs(8).replace("\r\n", "\n")
    tokgen = generate_tokens(source)

    soft_key_lines = find_soft_key_lines(source)

    for ttype, ttext, (sline, scol), (_, ecol), _ in _phys_tokens(tokgen):
        mark_start = True
        for part in re.split("(\n)", ttext):
            if part == "\n":
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == "":
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if env.PYBEHAVIOR.fstring_syntax and ttype == token.FSTRING_MIDDLE:
                    part = part.replace("{", "{{").replace("}", "}}")
                    ecol = scol + len(part)
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, "xx").lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif keyword.issoftkeyword(ttext):
                        # Soft keywords appear at the start of their line.
                        if len(line) == 0:
                            is_start_of_line = True
                        elif (len(line) == 1) and line[0][0] == "ws":
                            is_start_of_line = True
                        else:
                            is_start_of_line = False
                        if is_start_of_line and sline in soft_key_lines:
                            tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
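

# A small usage sketch (illustrative, using the same example as the
# docstring's format):
#
#     list(source_token_lines("def hello():\n    pass\n"))
#     # -> [[('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('),
#     #      ('op', ')'), ('op', ':')],
#     #     [('ws', '    '), ('key', 'pass')]]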


def generate_tokens(text: str) -> TokenInfos:
    """A helper around `tokenize.generate_tokens`.

    Originally this was used to cache the results, but it didn't seem to make
    reporting go faster, and caused issues with using too much memory.

    """
    readline = io.StringIO(text).readline
    return tokenize.generate_tokens(readline)
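

# Equivalently (illustrative): list(generate_tokens("x = 1\n")) produces the
# same stream as tokenize.generate_tokens(io.StringIO("x = 1\n").readline).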


def source_encoding(source: bytes) -> str:
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]
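

if __name__ == "__main__":  # pragma: no cover
    # A minimal smoke demo (illustrative only; not part of coverage.py's
    # public interface). It tokenizes a tiny program, then detects a
    # PEP 263 encoding declaration.
    for demo_line in source_token_lines("def hello():\n    pass\n"):
        print(demo_line)
    print(source_encoding(b"# -*- coding: iso-8859-1 -*-\nx = 1\n"))  # iso-8859-1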