Skip to content

Commit 6f37980

Browse files
committed
Fix regular-expression based extractors offset
1 parent 6dc8ab3 commit 6f37980

File tree

10 files changed

+282
-43
lines changed

10 files changed

+282
-43
lines changed

atest/05_Features/Completion.robot

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,28 @@ Shows Documentation With CompletionItem Resolve
296296
Completer Should Include Documentation the default method of the
297297
[Teardown] Clean Up After Working With File completion.R
298298

299+
Shows Only Relevant Suggestions In Known Magics
300+
# https://github.com/krassowski/jupyterlab-lsp/issues/559
301+
# h<tab>
302+
Enter Cell Editor 20 line=2
303+
Trigger Completer
304+
Completer Should Suggest help
305+
Completer Should Not Suggest from
306+
Completer Should Suggest hash
307+
308+
Completes In R Magics
309+
# Proper completion in R magics needs to be tested as:
310+
# - R magic extractor uses a tailor-made replacer function, not tested elsewhere
311+
# - R lanugage server is very sensitive to off-by-one errors (see https://github.com/REditorSupport/languageserver/issues/395)
312+
# '%%R\n librar<tab>'
313+
Enter Cell Editor 22 line=2
314+
Trigger Completer
315+
Completer Should Suggest library
316+
# '%R lib<tab>'
317+
Enter Cell Editor 24 line=1
318+
Trigger Completer
319+
Completer Should Suggest library
320+
299321
*** Keywords ***
300322
Setup Completion Test
301323
Setup Notebook Python Completion.ipynb

atest/examples/Completion.ipynb

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,56 @@
150150
"source": [
151151
"t"
152152
]
153+
},
154+
{
155+
"cell_type": "markdown",
156+
"metadata": {},
157+
"source": [
158+
"Cell magics show only relevant suggestions (triggering after `h` should return `hash`, `help`, etc, but not `from`):"
159+
]
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": null,
164+
"metadata": {},
165+
"outputs": [],
166+
"source": [
167+
"%%python\n",
168+
"h"
169+
]
170+
},
171+
{
172+
"cell_type": "markdown",
173+
"metadata": {},
174+
"source": [
175+
"Test that the leading space does not cause issues in R cell magic:"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": null,
181+
"metadata": {},
182+
"outputs": [],
183+
"source": [
184+
"%%R\n",
185+
" librar"
186+
]
187+
},
188+
{
189+
"cell_type": "markdown",
190+
"metadata": {},
191+
"source": [
192+
"And that R line magic works too:"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": null,
198+
"metadata": {},
199+
"outputs": [],
200+
"source": [
201+
"%R li"
202+
]
153203
}
154204
],
155205
"metadata": {

packages/jupyterlab-lsp/src/extractors/regexp.spec.ts

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { expect } from 'chai';
2-
import { RegExpForeignCodeExtractor } from './regexp';
2+
import { getIndexOfCaptureGroup, RegExpForeignCodeExtractor } from './regexp';
33

44
let R_CELL_MAGIC_EXISTS = `%%R
55
some text
66
`;
77

8+
let PYTHON_CELL_MAGIC_WITH_H = `%%python
9+
h`;
10+
811
let NO_CELL_MAGIC = `%R
912
some text
1013
%%R
@@ -23,11 +26,24 @@ x = """<a href="#">
2326
</a>""";
2427
print(x)`;
2528

29+
30+
describe('getIndexOfCaptureGroup', () => {
31+
it('extracts index of a captured group', () => {
32+
// tests for https://github.com/krassowski/jupyterlab-lsp/issues/559
33+
let result = getIndexOfCaptureGroup(
34+
new RegExp('^%%(python|python2|python3|pypy)( .*?)?\n([^]*)'),
35+
'%%python\nh',
36+
'h'
37+
);
38+
expect(result).to.be.equal(9);
39+
});
40+
});
41+
2642
describe('RegExpForeignCodeExtractor', () => {
2743
let r_cell_extractor = new RegExpForeignCodeExtractor({
2844
language: 'R',
2945
pattern: '^%%R( .*?)?\n([^]*)',
30-
extract_to_foreign: '$2',
46+
foreign_capture_group: 2,
3147
keep_in_host: true,
3248
is_standalone: false,
3349
file_extension: 'R'
@@ -36,12 +52,21 @@ describe('RegExpForeignCodeExtractor', () => {
3652
let r_line_extractor = new RegExpForeignCodeExtractor({
3753
language: 'R',
3854
pattern: '(^|\n)%R (.*)\n?',
39-
extract_to_foreign: '$2',
55+
foreign_capture_group: 2,
4056
keep_in_host: true,
4157
is_standalone: false,
4258
file_extension: 'R'
4359
});
4460

61+
let python_cell_extractor = new RegExpForeignCodeExtractor({
62+
language: 'python',
63+
pattern: '^%%(python|python2|python3|pypy)( .*?)?\n([^]*)',
64+
foreign_capture_group: 3,
65+
keep_in_host: true,
66+
is_standalone: true,
67+
file_extension: 'py'
68+
});
69+
4570
describe('#has_foreign_code()', () => {
4671
it('detects cell magics', () => {
4772
let result = r_cell_extractor.has_foreign_code(R_CELL_MAGIC_EXISTS);
@@ -75,16 +100,35 @@ describe('RegExpForeignCodeExtractor', () => {
75100
});
76101

77102
describe('#extract_foreign_code()', () => {
103+
104+
it('should correctly return the range', () => {
105+
let results = python_cell_extractor.extract_foreign_code(PYTHON_CELL_MAGIC_WITH_H);
106+
expect(results.length).to.equal(1);
107+
108+
let result = results[0];
109+
110+
// test against https://github.com/krassowski/jupyterlab-lsp/issues/559
111+
expect(result.host_code).to.equal(PYTHON_CELL_MAGIC_WITH_H);
112+
expect(result.foreign_code).to.equal(
113+
'h'
114+
);
115+
116+
expect(result.range.start.line).to.equal(1);
117+
expect(result.range.start.column).to.equal(0);
118+
expect(result.range.end.line).to.equal(1);
119+
expect(result.range.end.column).to.equal(1);
120+
});
121+
78122
it('should work with non-line magic and non-cell magic code snippets as well', () => {
79123
// Note: in the real application, one should NOT use regular expressions for HTML extraction
80124

81125
let html_extractor = new RegExpForeignCodeExtractor({
82126
language: 'HTML',
83-
pattern: '<(.*?)( .*?)?>([^]*?)</\\1>',
84-
extract_to_foreign: '<$1$2>$3</$1>',
127+
pattern: '(<(.*?)( .*?)?>([^]*?)</\\2>)',
128+
foreign_capture_group: 1,
85129
keep_in_host: false,
86130
is_standalone: false,
87-
file_extension: 'R'
131+
file_extension: 'html'
88132
});
89133

90134
let results = html_extractor.extract_foreign_code(HTML_IN_PYTHON);
@@ -118,7 +162,7 @@ describe('RegExpForeignCodeExtractor', () => {
118162
let extractor = new RegExpForeignCodeExtractor({
119163
language: 'R',
120164
pattern: '^%%R( .*?)?\n([^]*)',
121-
extract_to_foreign: '$2',
165+
foreign_capture_group: 2,
122166
keep_in_host: false,
123167
is_standalone: false,
124168
file_extension: 'R'
@@ -136,7 +180,7 @@ describe('RegExpForeignCodeExtractor', () => {
136180
let r_line_extractor = new RegExpForeignCodeExtractor({
137181
language: 'R',
138182
pattern: '(^|\n)%R (.*)\n?',
139-
extract_to_foreign: '$2',
183+
foreign_capture_group: 2,
140184
keep_in_host: false,
141185
is_standalone: false,
142186
file_extension: 'R'

packages/jupyterlab-lsp/src/extractors/regexp.ts

Lines changed: 78 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,38 @@ import { position_at_offset } from '../positioning';
33
import { replacer } from '../overrides/tokens';
44
import { CodeEditor } from '@jupyterlab/codeeditor';
55

6+
7+
export function getIndexOfCaptureGroup(expression: RegExp, matched_string: string, value_of_captured_group: string): number {
8+
// TODO: use https://github.com/tc39/proposal-regexp-match-indices once supported in >95% of browsers
9+
// (probably around 2025)
10+
11+
// get index of the part that is being extracted to foreign document
12+
let captured_groups = expression.exec(matched_string);
13+
let offset_in_match = 0;
14+
15+
// first element is full match
16+
let full_matched = captured_groups[0];
17+
18+
for (let group of captured_groups.slice(1)) {
19+
20+
if (typeof group === 'undefined') {
21+
continue;
22+
}
23+
24+
if (group === value_of_captured_group) {
25+
offset_in_match += full_matched.indexOf(group);
26+
break;
27+
}
28+
29+
let group_end_offset = full_matched.indexOf(group) + group.length;
30+
31+
full_matched = full_matched.slice(group_end_offset);
32+
offset_in_match += group_end_offset;
33+
}
34+
35+
return offset_in_match;
36+
}
37+
638
export class RegExpForeignCodeExtractor implements IForeignCodeExtractor {
739
options: RegExpForeignCodeExtractor.IOptions;
840
language: string;
@@ -37,14 +69,18 @@ export class RegExpForeignCodeExtractor implements IForeignCodeExtractor {
3769
let match: RegExpExecArray = this.global_expression.exec(code);
3870
let host_code_fragment: string;
3971

72+
let new_api_replacer = typeof this.options.foreign_replacer !== 'undefined' ? this.options.foreign_replacer : ('$' + this.options.foreign_capture_group);
73+
const replacer = typeof this.options.extract_to_foreign !== 'undefined' ? this.options.extract_to_foreign : new_api_replacer;
74+
4075
while (match != null) {
4176
let matched_string = match[0];
4277
let position_shift: CodeEditor.IPosition = null;
78+
4379
let foreign_code_fragment = matched_string.replace(
4480
this.expression,
4581
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
4682
// @ts-ignore
47-
this.options.extract_to_foreign
83+
replacer
4884
);
4985
let prefix = '';
5086
if (typeof this.options.extract_arguments !== 'undefined') {
@@ -72,11 +108,22 @@ export class RegExpForeignCodeExtractor implements IForeignCodeExtractor {
72108
}
73109
}
74110

75-
// TODO: this could be slightly optimized (start at start) by using the match[n],
76-
// where n is the group to be used; while this reduces the flexibility of extract_to_foreign,
77-
// it might be better to enforce such strict requirement
111+
let foreign_code_group_value = foreign_code_fragment;
112+
113+
if (new_api_replacer) {
114+
foreign_code_group_value = matched_string.replace(
115+
this.expression,
116+
'$' + this.options.foreign_capture_group
117+
);
118+
}
119+
120+
const foreign_group_index_in_match = getIndexOfCaptureGroup(
121+
this.expression, matched_string, foreign_code_group_value
122+
);
123+
78124
let start_offset =
79-
match.index + matched_string.indexOf(foreign_code_fragment);
125+
match.index + foreign_group_index_in_match;
126+
80127
let start = position_at_offset(start_offset, lines);
81128
let end = position_at_offset(
82129
start_offset + foreign_code_fragment.length,
@@ -118,16 +165,36 @@ namespace RegExpForeignCodeExtractor {
118165
* String giving regular expression to test cells for the foreign language presence.
119166
*
120167
* For example:
121-
* - %%R( (.*))?\n(.*) will match R cells of rpy2
122-
* - (.*)'<html>(.*)</html>'(.*) will match html documents in strings of any language using single ticks
168+
* - `%%R( (.*))?\n(.*)` will match R cells of rpy2
169+
* - `(.*)'<html>(.*)</html>'(.*)` will match html documents in strings of any language using single ticks
123170
*/
124171
pattern: string;
125172
/**
126173
* String specifying match groups to be extracted from the regular expression match,
127174
* for the use in virtual document of the foreign language.
128-
* For the R example this should be '$3'
175+
* For the R example this should be `3`. Please not that these are 1-based, as the 0th index is the full match.
176+
*
177+
* If more than one capture group is needed to extract the code (which is rarely the case:
178+
* usually one can use non-capturing groups rather than multiple adjacent capturing groups),
179+
* specify the first capturing group to allow for proper calculation of the start offset,
180+
* and handle any additional groups using `foreign_replacer`.
181+
*
182+
* `foreign_capture_group` is required for proper offset calculation and will no longer be optional in 4.0.
183+
*/
184+
foreign_capture_group?: number;
185+
/**
186+
* Function to compose the foreign document code, in case if using a capture group alone is not sufficient;
187+
* If specified, `foreign_capture_group` should be specified as well, so that it points to the first occurrence
188+
* of the foreign code. When both are specified, `foreign_replacer` takes precedence.
189+
*
190+
* See:
191+
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace#specifying_a_function_as_a_parameter
192+
*/
193+
foreign_replacer?: replacer;
194+
/**
195+
* @deprecated `extract_to_foreign` will be removed in 4.0; use `foreign_capture_group` or `foreign_replacer` instead
129196
*/
130-
extract_to_foreign: string | replacer;
197+
extract_to_foreign?: string | replacer;
131198
/**
132199
* If arguments from the cell or line magic are to be extracted and prepended before the extracted code,
133200
* set extract_arguments to a replacer function taking the code and returning the string to be prepended.
@@ -143,6 +210,8 @@ namespace RegExpForeignCodeExtractor {
143210
*
144211
* Setting to false is DEPRECATED as it breaks the edit feature (while it could be fixed,
145212
* it would make the code considerably more complex).
213+
*
214+
* @deprecated `keep_in_host` will be removed in 4.0
146215
*/
147216
keep_in_host?: boolean;
148217
/**

packages/jupyterlab-lsp/src/extractors/testutils.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,14 @@ export function extract_code(document: VirtualDocument, code: string) {
1212
);
1313
}
1414

15+
interface IDocumentWithRange {
16+
range: CodeEditor.IRange;
17+
virtual_document: VirtualDocument;
18+
}
19+
1520
export function get_the_only_pair(
1621
foreign_document_map: Map<CodeEditor.IRange, IVirtualDocumentBlock>
17-
) {
22+
): IDocumentWithRange {
1823
expect(foreign_document_map.size).to.equal(1);
1924

2025
let range = foreign_document_map.keys().next().value;

0 commit comments

Comments
 (0)