Skip to content

Commit 7ffdfba

Browse files
committed
Reduce benchmark allocations from 372MB to 213MB by rewriting input[n] char-at-a-time accesses
1 parent 7d0a032 commit 7ffdfba

File tree

7 files changed

+1538
-60
lines changed

7 files changed

+1538
-60
lines changed

lib/stupidedi/reader/substring.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class Substring < Pointer
99
#########################################################################
1010

1111
def_delegators :reify, :to_d
12-
def_delegators :reify, :to_sym, :intern, :to_i, type: String
12+
def_delegators :reify, :to_sym, :intern, :to_i, :split, type: String
1313
def_delegators :@storage, :encoding, :valid_encoding?, type: String
1414

1515
alias_method :to_s, :reify

lib/stupidedi/reader/tokenizer.rb

Lines changed: 113 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ module Reader
99
# input.at(offset). It might be more efficient to replace char-at-a-time reads
1010
# with Regexps that consume multiple characters at once.
1111

12+
# tr separators.segment
13+
# gs separators.element
14+
# us separators.component
15+
# rs separators.repetition
16+
1217
class Tokenizer
1318
include Inspect
1419

@@ -190,15 +195,15 @@ def _next_isa_segment_id(input)
190195
return eof("ISA", input.position) unless input.defined_at?(s)
191196

192197
# The character after this "I" is not "S"
193-
next unless input[s, 1] == @s
198+
next unless input.start_with?(@s, s)#[s, 1] == @s
194199

195200
a = input.min_graphic_index(s+1)
196201

197202
#return eof("ISA", input.position_at(i)) unless input.defined_at?(a)
198203
return eof("ISA", input.position) unless input.defined_at?(a)
199204

200205
# The character after this "S" is not "A"
201-
next unless input[a, 1] == @a
206+
next unless input.start_with?(@a, a)#[a, 1] == @a
202207

203208
# The next character determines the element separator. If it's an
204209
# alphanumeric or space, we assume this is not the start of an ISA
@@ -265,20 +270,30 @@ def _next_segment_id(input)
265270
buffer = input.pointer.drop_take(offset, 0)
266271
start_pos = input.position_at(offset)
267272

268-
while true
269-
return eof("segment identifier", input.position) \
270-
unless input.defined_at?(offset)
273+
# while true
274+
# return eof("segment identifier", input.position) \
275+
# unless input.defined_at?(offset)
271276

272-
char = input.at(offset)
273-
break if char == @separators.element
274-
break if char == @separators.segment
277+
# char = input.at(offset)
278+
# break if char == @separators.element
279+
# break if char == @separators.segment
275280

276-
# Zero-copy as long as we've not skipped over any characters yet
277-
buffer << char if input.graphic?(offset)
278-
offset += 1
281+
# # Zero-copy as long as we've not skipped over any characters yet
282+
# buffer << char if input.graphic?(offset)
283+
# offset += 1
279284

280-
break if buffer.length >= 3
281-
end
285+
# break if buffer.length >= 3
286+
# end
287+
288+
# Whichever occurs first
289+
gs = input.index(@separators.element, offset)
290+
tr = input.index(@separators.segment, offset)
291+
xx = if gs and tr and gs < tr; then gs end || tr || gs
292+
return eof("segment identifier", input.position) unless xx
293+
294+
length = xx - offset
295+
length = 3 if length > 3
296+
buffer = input[offset, length]
282297

283298
# This is the only String allocation we cannot get around. The `match?`
284299
# call either has a pattern with \A..\z, or the length of segment_id
@@ -292,7 +307,7 @@ def _next_segment_id(input)
292307
return expected("segment identifier, found %s" % segment_id.inspect,
293308
start_pos) unless segment_id.match?(VALID_SEGMENT_ID)
294309

295-
return done(segment_id.to_sym, start_pos, input.drop!(offset))
310+
return done(segment_id.to_sym, start_pos, input.drop!(xx))
296311
end
297312

298313
# @param input should be positioned on an element separator: "NM1[*].."
@@ -429,27 +444,57 @@ def _read_component_element(input, repeatable, segment_id, element_idx, componen
429444
builder = @switcher_.switch(repeatable, input.position)
430445

431446
while input.defined_at?(offset)
432-
char = input.at(offset)
447+
# char = input.at(offset)
448+
449+
# if repeatable and char == @separators.repetition
450+
# builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
451+
# offset += 1
452+
# repeat_pos = input.position_at(offset)
453+
454+
# elsif char == @separators.segment \
455+
# or char == @separators.element \
456+
# or char == @separators.component \
457+
# or char == @separators.repetition
458+
#   # Because we're not repeatable, a repetition separator could
459+
# # belong to the parent/composite element. If it's not repeatable
460+
# # either, an error can be returned.
461+
# builder.add(Tokens::ComponentElementTok.build(buffer, repeat_pos))
462+
# return done(builder.build, builder.position, input.drop!(offset))
463+
464+
# else
465+
# # This is zero-copy as long as we haven't skipped any characters
466+
# buffer << char if input.graphic?(offset)
467+
# offset += 1
468+
# end
469+
470+
# Whichever occurs first
471+
tr = input.index(@separators.segment, offset) if @separators.segment
472+
gs = input.index(@separators.element, offset) if @separators.element
473+
us = input.index(@separators.component, offset) if @separators.component
474+
xx = if gs and tr and gs < tr; then gs end || tr || gs
475+
xx = if xx and us and xx < us; then xx end || us || xx
476+
477+
rs = input.index(@separators.repetition, offset) \
478+
if @separators.repetition and repeatable
479+
480+
if rs and rs < xx
481+
length = rs - offset
482+
buffer = input[offset, length]
433483

434-
if repeatable and char == @separators.repetition
435484
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
436-
repeat_pos = input.position_at(offset + 1)
437-
offset += 1
485+
offset = rs + 1
486+
repeat_pos = input.position_at(offset)
487+
elsif xx
488+
length = xx - offset
489+
buffer = input[offset, length]
438490

439-
elsif char == @separators.segment \
440-
or char == @separators.element \
441-
or char == @separators.component \
442-
or char == @separators.repetition
443491
# Because we're not repeatable, a repetition separator could
444492
# belong to the parent/composite element. If it's not repeatable
445493
# either, an error can be returned.
446494
builder.add(Tokens::ComponentElementTok.build(buffer, repeat_pos))
447-
return done(builder.build, builder.position, input.drop!(offset))
448-
495+
return done(builder.build, builder.position, input.drop!(xx))
449496
else
450-
# This is zero-copy as long as we haven't skipped any characters
451-
buffer << char if input.graphic?(offset)
452-
offset += 1
497+
break
453498
end
454499
end
455500

@@ -482,29 +527,58 @@ def _read_simple_element(input, repeatable, segment_id, element_idx)
482527
input.position) unless input.start_with?(@separators.element)
483528

484529
offset = input.min_graphic_index(1)
485-
buffer = input.pointer.drop_take(offset, 0)
530+
#buffer = input.pointer.drop_take(offset, 0)
486531
start_pos = input.position
487532
repeat_pos = input.position
488533
builder = @switcher.switch(repeatable, input.position)
489534

490535
while input.defined_at?(offset)
491-
char = input.at(offset)
492-
493-
if char == @separators.element \
494-
or char == @separators.segment
495-
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
496-
return done(builder.build, start_pos, input.drop!(offset))
536+
# char = input.at(offset)
537+
538+
# if char == @separators.element \
539+
# or char == @separators.segment
540+
# builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
541+
# return done(builder.build, start_pos, input.drop!(offset))
542+
543+
# elsif repeatable and char == @separators.repetition
544+
# builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
545+
# offset += 1
546+
# buffer = input.pointer.drop_take(offset, 0)
547+
# repeat_pos = input.position_at(offset)
548+
549+
# else
550+
# # This is zero-copy as long as we haven't skipped any characters
551+
# buffer << char if input.graphic?(offset)
552+
# offset += 1
553+
# end
554+
555+
# Whichever occurs first
556+
tr = input.index(@separators.segment, offset) if @separators.segment
557+
gs = input.index(@separators.element, offset) if @separators.element
558+
xx = if gs and tr and gs < tr; then gs end || tr || gs
559+
break unless xx
560+
561+
rs = input.index(@separators.repetition, offset) if @separators.repetition
562+
563+
if rs and rs < xx
564+
# @separators.repetition
565+
length = rs - offset
566+
buffer = input[offset, length]
497567

498-
elsif repeatable and char == @separators.repetition
499568
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
500-
offset += 1
569+
offset = rs + 1
501570
buffer = input.pointer.drop_take(offset, 0)
502571
repeat_pos = input.position_at(offset)
572+
elsif xx
573+
# @separators.element
574+
# @separators.segment
575+
length = xx - offset
576+
buffer = input[offset, length]
503577

578+
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
579+
return done(builder.build, start_pos, input.drop!(xx))
504580
else
505-
# This is zero-copy as long as we haven't skipped any characters
506-
buffer << char if input.graphic?(offset)
507-
offset += 1
581+
break
508582
end
509583
end
510584

0 commit comments

Comments
 (0)