Skip to content

Commit 89e08ef

Browse files
feat: ✨ Improve performance (#65)
1 parent 9de17a3 commit 89e08ef

File tree

23 files changed

+1155
-1168
lines changed

23 files changed

+1155
-1168
lines changed

codelimit/common/Scanner.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323
from codelimit.common.Token import Token
2424
from codelimit.common.lexer_utils import lex
2525
from codelimit.common.report.Report import Report
26-
from codelimit.common.scope.scope_utils import build_scopes, unfold_scopes
26+
from codelimit.common.scope.scope_utils import build_scopes, unfold_scopes, count_lines
27+
from codelimit.common.source_utils import filter_tokens
2728
from codelimit.common.utils import (
2829
calculate_checksum,
2930
)
@@ -141,11 +142,12 @@ def scan_file(tokens: list[Token], language: Language) -> list[Measurement]:
141142
scopes = build_scopes(tokens, language)
142143
scopes = unfold_scopes(scopes)
143144
measurements: list[Measurement] = []
145+
code_tokens = filter_tokens(tokens)
144146
if scopes:
145147
for scope in scopes:
146-
length = len(scope)
147-
start_location = scope.header.token_range[0].location
148-
last_token = scope.block.tokens[-1]
148+
length = count_lines(scope, code_tokens)
149+
start_location = code_tokens[scope.header.token_range.start].location
150+
last_token = code_tokens[scope.block.end - 1]
149151
end_location = Location(
150152
last_token.location.line,
151153
last_token.location.column + len(last_token.value),

codelimit/common/TokenRange.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,37 @@
44

55

66
class TokenRange:
    """A half-open span of token indices [start, end) into a token list.

    The range stores indices only; the token list itself is passed in where
    token data is needed (see `token_string`).
    """

    def __init__(self, start: int, end: int):
        self.start = start  # index of the first token in the range
        self.end = end      # exclusive: index one past the last token

    def __str__(self):
        return f"({self.start}, {self.end})"

    def __repr__(self):
        return self.__str__()

    def token_string(self, tokens: list[Token]) -> str:
        """Return the space-joined values of the tokens covered by this range."""
        return " ".join([t.value for t in tokens[self.start:self.end]])

    def lt(self, other: TokenRange) -> bool:
        """True if this range starts before *other*."""
        return self.start < other.start

    def gt(self, other: TokenRange) -> bool:
        return other.lt(self)

    def contains(self, other: TokenRange) -> bool:
        """True if *other* lies strictly inside this range."""
        return self.start < other.start and self.end > other.end

    def overlaps(self, other: TokenRange) -> bool:
        """True if the two ranges share any part of their spans.

        BUG FIX: the previous check only tested whether *other*'s start or end
        fell inside *self*, so it returned False when *other* fully contained
        *self* — making overlaps() asymmetric. The interval test below covers
        that case while preserving the original (end-inclusive) boundary
        behavior for all previously-overlapping pairs.
        """
        return self.start <= other.end and other.start <= self.end
33+
34+
35+
def sort_token_ranges(token_ranges: list[TokenRange], tokens: list[Token], reverse=False) -> list[TokenRange]:
    """Order token ranges by the source location of each range's first token."""

    def source_position(token_range: TokenRange):
        location = tokens[token_range.start].location
        return location.line, location.column

    return sorted(token_ranges, key=source_position, reverse=reverse)

codelimit/common/scope/Header.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
11
from dataclasses import dataclass
22

3+
from codelimit.common.Token import Token
34
from codelimit.common.TokenRange import TokenRange
45

56

67
@dataclass
class Header:
    """A named scope header (e.g. a function signature) and its token span."""

    name: str
    token_range: TokenRange


def sort_headers(headers: list[Header], tokens: list[Token], reverse=False) -> list[Header]:
    """Order headers by the source location of each header's first token."""

    def source_position(header: Header):
        location = tokens[header.token_range.start].location
        return location.line, location.column

    return sorted(headers, key=source_position, reverse=reverse)

codelimit/common/scope/Scope.py

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
from codelimit.common.Token import Token
43
from codelimit.common.TokenRange import TokenRange
54
from codelimit.common.scope.Header import Header
65

@@ -12,31 +11,10 @@ def __init__(self, header: Header, block: TokenRange):
1211
self.children: list[Scope] = []
1312

1413
def __str__(self):
15-
return (
16-
f"[{self.header.token_range[0].location}, {self.block.tokens[-1].location}]"
17-
)
14+
return f"({self.header.token_range.start}, {self.block.end})"
1815

1916
def __repr__(self):
2017
return self.__str__()
2118

22-
def __len__(self):
23-
return count_lines(self.tokens())
24-
25-
def tokens(self):
26-
children_tokens = []
27-
for child in self.children:
28-
children_tokens.extend(child.tokens())
29-
return [
30-
t
31-
for t in self.header.token_range.tokens + self.block.tokens
32-
if t not in children_tokens
33-
]
34-
3519
def contains(self, other: Scope) -> bool:
36-
return self.header.token_range[0].location.lt(
37-
other.header.token_range[0].location
38-
) and self.block.tokens[-1].location.gt(other.block.tokens[-1].location)
39-
40-
41-
def count_lines(tokens: list[Token]):
42-
return len(set([t.location.line for t in tokens]))
20+
return self.header.token_range.start < other.header.token_range.start and self.block.end > other.block.end

codelimit/common/scope/scope_utils.py

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,13 @@
22

33
from codelimit.common.Language import Language
44
from codelimit.common.Token import Token
5-
from codelimit.common.TokenRange import TokenRange
5+
from codelimit.common.TokenRange import TokenRange, sort_token_ranges
66
from codelimit.common.gsm.Expression import Expression
77
from codelimit.common.gsm.matcher import find_all, starts_with
8-
from codelimit.common.scope.Header import Header
8+
from codelimit.common.scope.Header import Header, sort_headers
99
from codelimit.common.scope.Scope import Scope
1010
from codelimit.common.source_utils import filter_tokens, filter_nocl_comment_tokens
11-
from codelimit.common.token_utils import (
12-
sort_tokens,
13-
sort_token_ranges,
14-
get_balanced_symbol_token_indices,
15-
)
11+
from codelimit.common.token_utils import get_balanced_symbol_token_indices
1612
from codelimit.common.utils import delete_indices
1713

1814

@@ -21,8 +17,8 @@ def build_scopes(tokens: list[Token], language: Language) -> list[Scope]:
2117
nocl_comment_tokens = filter_nocl_comment_tokens(tokens)
2218
headers = language.extract_headers(code_tokens)
2319
blocks = language.extract_blocks(code_tokens, headers)
24-
scopes = _build_scopes_from_headers_and_blocks(headers, blocks)
25-
filtered_scopes = _filter_nocl_scopes(scopes, nocl_comment_tokens)
20+
scopes = _build_scopes_from_headers_and_blocks(headers, blocks, code_tokens)
21+
filtered_scopes = _filter_nocl_scopes(scopes, code_tokens, nocl_comment_tokens)
2622
if language.allow_nested_functions:
2723
return fold_scopes(filtered_scopes)
2824
else:
@@ -66,26 +62,23 @@ def filter_scopes_nested_functions(scopes: list[Scope]) -> list[Scope]:
6662

6763

6864
def _build_scopes_from_headers_and_blocks(
    headers: list[Header], blocks: list[TokenRange], tokens: list[Token]
) -> list[Scope]:
    """Pair each header with the blocks that form its scope body.

    Headers are visited in reverse source order; the blocks a header claims
    are removed from the pool, so headers processed later (earlier in the
    source) cannot reuse them. The returned scopes are in source order.
    """
    scopes: list[Scope] = []
    for header in sort_headers(headers, tokens, reverse=True):
        block_indices = _find_scope_blocks_indices(header.token_range, blocks)
        if block_indices:
            claimed = [blocks[i] for i in block_indices]
            # The scope body spans from the first claimed block to the last.
            body = TokenRange(
                min(block.start for block in claimed),
                max(block.end for block in claimed),
            )
            scopes.append(Scope(header, body))
        blocks = delete_indices(blocks, block_indices)
    scopes.reverse()
    return scopes
8578

8679

8780
def _find_scope_blocks_indices(
88-
header: TokenRange, blocks: list[TokenRange]
81+
header: TokenRange, blocks: list[TokenRange]
8982
) -> list[int]:
9083
body_block = _get_nearest_block(header, blocks)
9184
if body_block:
@@ -97,7 +90,7 @@ def _find_scope_blocks_indices(
9790

9891

9992
def _get_nearest_block(
100-
header: TokenRange, blocks: list[TokenRange]
93+
header: TokenRange, blocks: list[TokenRange]
10194
) -> Optional[TokenRange]:
10295
reverse_blocks = blocks[::-1]
10396
result = None
@@ -112,13 +105,15 @@ def _get_nearest_block(
112105

113106

114107
def _filter_nocl_scopes(
115-
scopes: list[Scope], nocl_comment_tokens: list[Token]
108+
scopes: list[Scope], tokens: list[Token], nocl_comment_tokens: list[Token]
116109
) -> list[Scope]:
117110
nocl_comment_lines = [t.location.line for t in nocl_comment_tokens]
118111

119112
def get_scope_header_lines(scope: Scope) -> set[int]:
120-
result = set([t.location.line for t in scope.header.token_range.tokens])
121-
first_line = scope.header.token_range.tokens[0].location.line
113+
header_token_range = scope.header.token_range
114+
header_tokens = tokens[header_token_range.start:header_token_range.end]
115+
result = set([t.location.line for t in header_tokens])
116+
first_line = header_tokens[0].location.line
122117
if first_line > 0:
123118
result.add(first_line - 1)
124119
return result
@@ -139,25 +134,43 @@ def has_curly_suffix(tokens: list[Token], index):
139134

140135

141136
def get_headers(
    tokens: list[Token], expression: Expression, followed_by: Expression = None
) -> list[Header]:
    """Find scope headers by matching *expression* against *tokens*.

    When *followed_by* is given, only matches that are immediately followed by
    that expression are kept. One Header is produced per match, named after
    the match's first name token; matches with no name token are skipped.
    """
    patterns = find_all(expression, tokens)
    if followed_by:
        patterns = [p for p in patterns if starts_with(followed_by, tokens[p.end:])]
    result = []
    for pattern in patterns:
        # BUG FIX: next() without a default raises StopIteration when a match
        # contains no name token; the None default makes the guard below
        # actually skip such matches instead of crashing.
        name_token = next((t for t in pattern.tokens if t.is_name()), None)
        if name_token is not None:
            result.append(Header(name_token.value, TokenRange(pattern.start, pattern.end)))
    return result
154149

155150

156151
def get_blocks(
    tokens: list[Token], open: str, close: str, extract_nested=True
) -> list[TokenRange]:
    """Return source-ordered token ranges for every balanced open/close pair."""
    ranges = []
    for first, last in get_balanced_symbol_token_indices(tokens, open, close, extract_nested):
        # The returned end index is inclusive; TokenRange ends are exclusive.
        ranges.append(TokenRange(first, last + 1))
    return sort_token_ranges(ranges, tokens)
159+
160+
161+
def count_lines(scope: Scope, tokens: list[Token]) -> int:
    """Count the distinct source lines covered by *scope*'s own tokens,
    excluding tokens that belong to any child scope."""
    return len({t.location.line for t in _scope_tokens(scope, tokens)})


def _scope_tokens(scope: Scope, tokens: list[Token]) -> list[Token]:
    """Return the scope's own tokens (header through block end), skipping
    tokens covered by a child scope."""
    # Each child covers the half-open index span [header start, block end).
    # Sorting by start index matches the previous source-location sort,
    # assuming `tokens` is in source order — TODO confirm against the lexer.
    child_spans = sorted(
        (child.header.token_range.start, child.block.end) for child in scope.children
    )
    result: list[Token] = []
    next_child = 0
    for index in range(scope.header.token_range.start, scope.block.end):
        # BUG FIX: the end bound is exclusive, so a child stops covering
        # `index` once index >= end. The previous `>` comparison wrongly
        # treated the first token after each child as still covered,
        # silently dropping it from the parent's line count.
        while next_child < len(child_spans) and index >= child_spans[next_child][1]:
            next_child += 1
        if next_child == len(child_spans) or index < child_spans[next_child][0]:
            result.append(tokens[index])
    return result
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from codelimit.common.Token import Token
2+
from codelimit.common.token_matching.predicate.TokenPredicate import TokenPredicate
3+
from codelimit.common.token_matching.predicate.TokenValue import TokenValue
4+
5+
6+
class Not(TokenPredicate):
    """Predicate that accepts exactly the tokens its wrapped predicate rejects."""

    def __init__(self, value: TokenPredicate | str):
        super().__init__()
        # A bare string is shorthand for matching that exact token value.
        if isinstance(value, str):
            self.predicate = TokenValue(value)
        else:
            self.predicate = value

    def accept(self, token: Token) -> bool:
        return not self.predicate.accept(token)

    def __eq__(self, other: object) -> bool:
        return isinstance(other, Not) and self.predicate == other.predicate

    def __hash__(self):
        return hash(self.predicate)

codelimit/common/token_utils.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
def get_balanced_symbol_token_indices(
8-
tokens: list[Token], start: str, end: str, extract_nested=False
8+
tokens: list[Token], start: str, end: str, extract_nested=False
99
) -> list[Tuple[int, int]]:
1010
result = []
1111
block_starts = []
@@ -21,32 +21,21 @@ def get_balanced_symbol_token_indices(
2121

2222

2323
def get_balanced_symbol_token_ranges(
24-
tokens: list[Token], start: str, end: str
24+
tokens: list[Token], start: str, end: str
2525
) -> list[TokenRange]:
2626
result = []
27-
token_lists = []
27+
start_indices: list[int] = []
2828
for index, t in enumerate(tokens):
2929
if t.is_symbol(start):
30-
token_lists.append([t])
30+
start_indices.append(index)
3131
elif t.is_symbol(end):
32-
if len(token_lists) > 0:
33-
token_lists[-1].append(t)
34-
tokens = token_lists.pop()
35-
result.append(TokenRange(tokens))
36-
else:
37-
if len(token_lists) > 0:
38-
token_lists[-1].append(t)
32+
if len(start_indices) > 0:
33+
start_index = start_indices.pop()
34+
result.append(TokenRange(start_index, index + 1))
3935
return result
4036

4137

4238
def sort_tokens(tokens: list[Token]) -> list[Token]:
4339
result = sorted(tokens, key=lambda t: t.location.column)
4440
result = sorted(result, key=lambda t: t.location.line)
4541
return result
46-
47-
48-
def sort_token_ranges(token_ranges: list[TokenRange]) -> list[TokenRange]:
49-
return sorted(
50-
token_ranges,
51-
key=lambda tr: (tr.tokens[0].location.line, tr.tokens[0].location.column),
52-
)

codelimit/languages/Java.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from codelimit.common.Language import Language
2+
from codelimit.common.Token import Token
23
from codelimit.common.gsm.operator.OneOrMore import OneOrMore
4+
from codelimit.common.scope.Header import Header
35
from codelimit.common.scope.scope_utils import (
46
get_blocks,
57
get_headers,
68
)
79
from codelimit.common.token_matching.predicate.Balanced import Balanced
810
from codelimit.common.token_matching.predicate.Choice import Choice
11+
from codelimit.common.token_matching.predicate.Keyword import Keyword
912
from codelimit.common.token_matching.predicate.Name import Name
1013

1114

@@ -14,9 +17,21 @@ def __init__(self):
1417
super().__init__("Java")
1518

1619
def extract_headers(self, tokens: list) -> list:
    """Match Java function headers — a name followed by one or more balanced
    parenthesis groups, then '{' or 'throws' — and drop false positives
    (see filter_headers)."""
    pattern = [Name(), OneOrMore(Balanced("(", ")"))]
    candidates = get_headers(tokens, pattern, Choice("{", "throws"))
    return filter_headers(candidates, tokens)
2024

2125
def extract_blocks(self, tokens: list, headers: list) -> list:
    """Return token ranges for all balanced curly-brace blocks.

    The `headers` argument is accepted for interface compatibility but is
    not used by the Java implementation.
    """
    return get_blocks(tokens, "{", "}")
27+
28+
29+
def filter_headers(headers: list[Header], tokens: list[Token]) -> list[Header]:
    """Drop header matches whose preceding token is 'record' or 'new' —
    presumably record declarations and anonymous-class instantiations,
    which are not function headers."""
    excluded = Choice(Keyword('record'), Keyword('new'))

    def is_function_header(header: Header) -> bool:
        previous_index = header.token_range.start - 1
        return previous_index < 0 or not excluded.accept(tokens[previous_index])

    return [header for header in headers if is_function_header(header)]

0 commit comments

Comments
 (0)