Skip to content

Commit d677fd7

Browse files
authored
Merge pull request #41 from Evref-BL/21-Finish-to-clean-the-APInot-using-the-right-encoding
Fixing issue #21
2 parents 7024a93 + 35d744e commit d677fd7

File tree

4 files changed

+45
-96
lines changed

4 files changed

+45
-96
lines changed

src/TreeSitter-Highlighter/TSHighlighter.class.st

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ TSHighlighter >> highlight: aString [
5353
text := aString asText.
5454
string := aString.
5555
tree := self parser parseString: aString.
56-
self highlight: text usingNode: tree rootNode.
56+
self highlight: text withPlatformLineEndings usingNode: tree rootNode.
5757
^ text
5858
]
5959

src/TreeSitter/String.extension.st

Lines changed: 29 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -2,89 +2,38 @@ Extension { #name : 'String' }
22

33
{ #category : '*TreeSitter' }
44
String >> positionFromTSPoint: aTSPoint [
5-
"Use me to convert a TSPoint to the position in the original string"
6-
7-
"Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions."
8-
9-
| sum currentLine lines |
10-
sum := aTSPoint column.
11-
currentLine := 0.
12-
lines := self tsLines.
13-
[ currentLine < aTSPoint row ] whileTrue: [
14-
sum := sum + (lines at: currentLine + 1) size.
15-
currentLine := currentLine + 1 ].
16-
^ sum
17-
]
18-
19-
{ #category : '*TreeSitter' }
20-
String >> tsLineIndicesDo: aBlock [
21-
"execute aBlock with 3 arguments for each line:
22-
- start index of line
23-
- end index of line without line delimiter
24-
- end index of line including line delimiter(s) CR, LF or CRLF"
25-
26-
27-
"Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions."
28-
29-
| cr lf start sz nextLF nextCR |
30-
start := 1.
31-
sz := self size.
32-
cr := Character cr.
33-
nextCR := self indexOf: cr startingAt: 1.
34-
lf := Character lf.
35-
nextLF := self indexOf: lf startingAt: 1.
36-
sz = 0
37-
ifTrue: [ aBlock value: sz value: sz value: sz.
38-
^ self ].
39-
[ start <= sz ]
40-
whileTrue: [ (nextLF = 0 and: [ nextCR = 0 ])
41-
ifTrue: [ "No more CR, nor LF, the string is over"
42-
aBlock value: start value: sz value: sz.
43-
^ self ].
44-
(nextCR = 0 or: [ 0 < nextLF and: [ nextLF < nextCR ] ])
45-
ifTrue: [ "Found a LF"
46-
aBlock value: start value: nextLF - 1 value: nextLF.
47-
start := 1 + nextLF.
48-
nextLF := self indexOf: lf startingAt: start ]
49-
ifFalse: [ 1 + nextCR = nextLF
50-
ifTrue: [ "Found a CR-LF pair"
51-
aBlock value: start value: nextCR - 1 value: nextLF.
52-
start := 1 + nextLF.
53-
nextCR := self indexOf: cr startingAt: start.
54-
nextLF := self indexOf: lf startingAt: start ]
55-
ifFalse: [ "Found a CR"
56-
aBlock value: start value: nextCR - 1 value: nextCR.
57-
start := 1 + nextCR.
58-
nextCR := self indexOf: cr startingAt: start ] ] ].
59-
aBlock value: start value: sz value: sz
5+
6+
^ self positionFromTSPoint: aTSPoint usingEncoding: #utf8
607
]
618

629
{ #category : '*TreeSitter' }
63-
String >> tsLines [
64-
"Same as lines but empty line exist"
10+
String >> positionFromTSPoint: aTSPoint usingEncoding: anEncoding [
6511

12+
"This method is used to convert a TSPoint to the position in the original string"
13+
"It is specifically used in the TSHighlighter to make it compatible with inspectionFASTSourceCode: of FASTEntity"
6614

67-
"Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions."
68-
69-
^ Array
70-
new: (self size // 60 max: 16)
71-
streamContents: [ :lines |
72-
self tsLinesDo: [ :aLine | lines nextPut: aLine ] ]
73-
]
74-
75-
{ #category : '*TreeSitter' }
76-
String >> tsLinesDo: aBlock [
77-
"Same as linesDo but empty line exist"
78-
79-
80-
"Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions."
81-
82-
self
83-
tsLineIndicesDo: [ :start :endWithoutDelimiters :end |
84-
| begin |
85-
"endWithoutDelimiters = start
86-
ifTrue: [ aBlock value: '' ]
87-
ifFalse: ["
88-
begin := (start = 0) ifTrue: [ 1 ] ifFalse: [ start ].
89-
aBlock value: (self copyFrom: begin to: end) "]" ]
15+
| bytes currentRow index |
16+
17+
bytes := self encodeWith: anEncoding. "converting code to bytes"
18+
19+
currentRow := 0.
20+
index := 1.
21+
22+
[ currentRow < aTSPoint row ] whileTrue: [
23+
index > bytes size ifTrue: [
24+
self error: 'Row exceeds number of lines'
25+
].
26+
27+
(bytes at: index) = 10 ifTrue: [ "10 is the byte value of LF (\n) in ASCII and UTF-8; this check is not reliable for encodings that are not ASCII-compatible (e.g. UTF-16)"
28+
currentRow := currentRow + 1
29+
].
30+
31+
index := index + 1.
32+
].
33+
34+
(index - 1 + aTSPoint column) > bytes size ifTrue: [
35+
self error: 'Column exceeds line length'
36+
].
37+
38+
^ index - 1 + aTSPoint column
9039
]

src/TreeSitter/TSLibrary.class.st

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,17 +208,17 @@ TSLibrary >> ts_parser: aParser _parse_string: aString ofLength: length usingOld
208208
]
209209

210210
{ #category : 'parser' }
211-
TSLibrary >> ts_parser: aTSParser _print_dot_graphs: fd [
212-
211+
TSLibrary >> ts_parser: aParser _parse_string: aString ofLength: length usingOldTree: anOldTree encoding: anEncoding [
212+
213213
^ self ffiCall:
214-
'void ts_parser_print_dot_graphs (TSParser * aTSParser, int fd )'
214+
'TSTree *ts_parser_parse_string_encoding(TSParser * aParser, const TSTree * anOldTree, const char * aString, uint32 length, TSInputEncoding anEncoding)'
215215
]
216216

217217
{ #category : 'parser' }
218-
TSLibrary >> ts_parser: aParser _parse_string: aString ofLength: length usingOldTree: anOldTree encoding: anEncoding [
219-
218+
TSLibrary >> ts_parser: aTSParser _print_dot_graphs: fd [
219+
220220
^ self ffiCall:
221-
'TSTree *ts_parser_parse_string_encoding(TSParser * aParser, const TSTree * anOldTree, const char * aString, uint32 length, TSInputEncoding anEncoding)'
221+
'void ts_parser_print_dot_graphs (TSParser * aTSParser, int fd )'
222222
]
223223

224224
{ #category : 'parser' }

src/TreeSitter/TSParser.class.st

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,6 @@ TSParser >> parseString: aString usingTree: aTree pharoEncoding: anEncoding [
126126
usingOldTree: aTree
127127
]
128128

129-
{ #category : 'parsing' }
130-
TSParser >> printDotGraphTo: aFileDescriptor [
131-
"-1 for no"
132-
133-
^ TSLibrary uniqueInstance
134-
ts_parser: self
135-
_print_dot_graphs: aFileDescriptor
136-
]
137-
138129
{ #category : 'parsing' }
139130
TSParser >> parseString: aString usingTree: aTree tsEncoding: anEncoding [
140131

@@ -150,6 +141,15 @@ TSParser >> parseString: aString usingTree: aTree tsEncoding: anEncoding [
150141
encoding: anEncoding
151142
]
152143

144+
{ #category : 'parsing' }
145+
TSParser >> printDotGraphTo: aFileDescriptor [
146+
"-1 for no"
147+
148+
^ TSLibrary uniqueInstance
149+
ts_parser: self
150+
_print_dot_graphs: aFileDescriptor
151+
]
152+
153153
{ #category : 'initialization' }
154154
TSParser >> reset [
155155

0 commit comments

Comments
 (0)