Skip to content

Commit 925c777

Browse files
committed
Remove non-control characters from tokens before construction
1 parent 2a69df3 commit 925c777

File tree

10 files changed

+330
-68
lines changed

10 files changed

+330
-68
lines changed

ext/c/stupidedi/reader/native_ext.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,83 @@ rb_min_graphic_index(int argc, const VALUE *argv, VALUE self)
602602
}
603603
}
604604

605+
/*
606+
* call-seq:
607+
* min_nongraphic_index(string, index=0) -> int
608+
*
609+
* Returns the smallest index (starting from the given index) that is not a
610+
* graphic character. If no non-graphic characters occur after the given index,
611+
* then the string length is returned.
612+
*
613+
* min_nongraphic_index("\r\nabc ") #=> 2
614+
* min_nongraphic_index("\r\nabc ", 2) #=> 2
615+
* min_nongraphic_index("\r\nabc ", 5) #=> 5
616+
* min_nongraphic_index("\r\n") #=> 2
617+
*/
618+
static VALUE
619+
rb_min_nongraphic_index(int argc, const VALUE *argv, VALUE self)
620+
{
621+
rb_check_arity(argc, 1, 2);
622+
623+
Check_Type(argv[0], T_STRING);
624+
if (argc >= 2)
625+
Check_Type(argv[1], T_FIXNUM);
626+
627+
VALUE str;
628+
str = argv[0];
629+
630+
char *ptr, *end;
631+
end = RSTRING_END(str);
632+
ptr = RSTRING_PTR(str);
633+
634+
long idx;
635+
idx = argc < 2 ? 0 : FIX2LONG(argv[1]);
636+
637+
int encidx;
638+
encidx = ENCODING_GET(str);
639+
640+
rb_encoding *enc;
641+
enc = rb_enc_from_index(encidx);
642+
643+
if (idx < 0) rb_raise(rb_eArgError, "index cannot be negative");
644+
if (ptr == NULL) return INT2FIX(0);
645+
646+
if (single_byte_optimizable(str, enc)) {
647+
ptr += idx; /* address of str[idx] */
648+
649+
if (end <= ptr)
650+
return LONG2NUM(RSTRING_LEN(str));
651+
652+
while (ptr < end && is_graphic(*ptr, encidx))
653+
ptr ++;
654+
655+
return LONG2NUM(ptr - RSTRING_PTR(str));
656+
} else {
657+
long len_, count;
658+
len_ = 1;
659+
count = 0;
660+
661+
/* address of str[idx], len is .bytesize */
662+
ptr = rb_str_subpos(str, idx, &len_);
663+
if (ptr == NULL) return rb_str_length(str);
664+
665+
unsigned int c;
666+
int len;
667+
668+
while (ptr < end) {
669+
c = rb_enc_codepoint_len(ptr, end, &len, enc);
670+
671+
if (!is_graphic(c, encidx))
672+
break;
673+
674+
ptr += len;
675+
count ++;
676+
}
677+
678+
return LONG2NUM(idx + count);
679+
}
680+
}
681+
605682
/*
606683
* call-seq:
607684
* min_nonspace_index(string, index=0) -> int
@@ -771,6 +848,7 @@ void Init_native_ext(void) {
771848
rb_define_singleton_method(rb_m, "graphic?", rb_graphic_p, -1);
772849
rb_define_singleton_method(rb_m, "whitespace?", rb_whitespace_p, -1);
773850
rb_define_singleton_method(rb_m, "min_graphic_index", rb_min_graphic_index, -1);
851+
rb_define_singleton_method(rb_m, "min_nongraphic_index",rb_min_nongraphic_index,-1);
774852
rb_define_singleton_method(rb_m, "min_nonspace_index", rb_min_nonspace_index, -1);
775853
rb_define_singleton_method(rb_m, "max_nonspace_index", rb_max_nonspace_index, -1);
776854
}

lib/stupidedi/reader.rb

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,15 @@ class << self
230230

231231
# Builds a tokenizer over the given input.
#
# When the last positional argument is a Hash, its :config and :strict
# entries are removed and forwarded to Tokenizer.build as keyword arguments;
# every other argument (including the remaining Hash entries) is passed on
# to Input.build.
#
# @return [StreamReader]
def build(input, *args)
  keywords = {}

  if args.last.is_a?(Hash)
    # Work on a copy so the caller's Hash is never mutated. `args` itself is
    # this method's own splat array, so replacing its last element is safe.
    options = args.pop.dup

    [:config, :strict].each do |key|
      keywords[key] = options.delete(key) if options.include?(key)
    end

    args.push(options)
  end

  source = Input.build(input, *args)

  # Skip the double-splat when there is nothing to forward, so the call is
  # exactly `Tokenizer.build(source)` in the common case.
  if keywords.empty?
    Tokenizer.build(source)
  else
    Tokenizer.build(source, **keywords)
  end
end
235243
end
236244
end

lib/stupidedi/reader/input.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class Input
1515
# @return [Position]
1616
attr_reader :position
1717

18-
def_delegators :@pointer, :head, :defined_at?, :empty?, :[],
18+
def_delegators :@pointer, :head, :defined_at?, :empty?, :[], :to_s,
1919
:take, :index, :=~, :==, :at, :encoding, :offset, type: Substring
2020

2121
def initialize(pointer, position)

lib/stupidedi/reader/substring.rb

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,9 @@ def lstrip!(start_at = 0)
260260
def rstrip(start_at = @length - 1)
261261
return self if @length.zero?
262262
raise ArgumentError, "start_at must be non-negative" if start_at < 0
263-
start_at = @length - 1 if start_at >= @length
264-
index = NativeExt.max_nonspace_index(@storage, @offset + start_at)
263+
264+
start_at = @length - 1 if start_at >= @length
265+
index = NativeExt.max_nonspace_index(@storage, @offset + start_at)
265266

266267
if index >= @offset + @length - 1
267268
self
@@ -315,6 +316,15 @@ def min_graphic_index(offset = 0)
315316
n - @offset
316317
end
317318

319+
# Finds the first non-graphic character at or after `offset`, where both the
# argument and the result are relative to the start of this substring. The
# search is delegated to NativeExt.min_nongraphic_index on the underlying
# storage, and the result is clamped so it never exceeds @length.
#
# @param offset [Integer] where to start searching; values beyond @length
#   are treated as @length
# @return [Integer]
# @raise [ArgumentError] when offset is negative
def min_nongraphic_index(offset = 0)
  raise ArgumentError, "offset must be non-negative" if offset < 0

  start = [offset, @length].min
  limit = @offset + @length
  found = NativeExt.min_nongraphic_index(@storage, @offset + start)

  # The native search runs to the end of @storage, which may extend past the
  # end of this substring
  [found, limit].min - @offset
end
327+
318328
# Returns true if the character at the given offset is a control
319329
# character (defined by X222.pdf B.1.1.2.4 Control Characters)
320330
#
@@ -324,6 +334,30 @@ def graphic?(offset)
324334
NativeExt.graphic?(@storage, @offset + offset)
325335
end
326336

337+
# @private
#
# Produces this substring's content with every run of non-graphic characters
# (as located by #min_nongraphic_index and #min_graphic_index) removed.
#
# @return [Substring, String] self, without allocating, when there is nothing
#   to remove; otherwise a newly built String
def clean
  stop = min_nongraphic_index
  return self if stop >= @length

  # Something must be removed, so the result cannot share storage (no
  # zero-copy). Seed the result with the leading graphic run, if there is one.
  cleaned = stop.zero? ? nil : @storage[@offset, stop]
  start   = 0

  while start < @length && stop < @length
    # Skip the non-graphic run, then measure the graphic run that follows
    start = min_graphic_index(stop)
    stop  = min_nongraphic_index(start)
    next if start == stop

    chunk = @storage[@offset + start, stop - start]

    if cleaned.nil?
      cleaned = chunk
    else
      cleaned << chunk
    end
  end

  cleaned || ""
end
360+
327361
# @endgroup
328362
#########################################################################
329363

lib/stupidedi/reader/tokenizer.rb

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -282,14 +282,14 @@ def _next_segment_id(input)
282282
#
283283
# Later, the segment_id will be subject to many equality checks, so it
284284
# is faster overall to make the substring copy here.
285-
segment_id = buffer.to_s
285+
segment_id = buffer.clean.to_s
286286

287287
return expected("segment identifier, found %s" % segment_id.inspect,
288288
start_pos) unless segment_id.match?(VALID_SEGMENT_ID)
289289

290-
# DEBUG
291-
raise unless input.start_with?(@separators.segment, xx) \
292-
or input.start_with?(@separators.element, xx)
290+
# TODO: DEBUG
291+
# raise unless input.start_with?(@separators.segment, xx) \
292+
# or input.start_with?(@separators.element, xx)
293293

294294
return done(segment_id.to_sym, start_pos, input.drop!(xx))
295295
end
@@ -328,8 +328,8 @@ def _read_elements(input, segment_id, element_uses)
328328
element_idx += 1
329329
end
330330

331-
# DEBUG
332-
raise unless input.start_with?(@separators.segment)
331+
# TODO: DEBUG
332+
# raise unless input.start_with?(@separators.segment)
333333

334334
# Skip past the segment separator
335335
done(element_toks, nil, input.lstrip_nongraphic!(1))
@@ -415,18 +415,18 @@ def _read_component_element(input, repeatable, parent_repeatable, segment_id, el
415415
buffer = input[offset, length]
416416

417417
if repeatable
418-
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
418+
builder.add(Tokens::SimpleElementTok.build(buffer.clean, repeat_pos))
419419
offset = rs + 1
420420
repeat_pos = input.position_at(offset)
421421
else # parent_repeatable
422-
builder.add(Tokens::ComponentElementTok.build(buffer, repeat_pos))
422+
builder.add(Tokens::ComponentElementTok.build(buffer.clean, repeat_pos))
423423
return done(builder.build, builder.position, input.drop!(rs))
424424
end
425425
elsif xx
426426
length = xx - offset
427427
buffer = input[offset, length]
428428

429-
builder.add(Tokens::ComponentElementTok.build(buffer, repeat_pos))
429+
builder.add(Tokens::ComponentElementTok.build(buffer.clean, repeat_pos))
430430
return done(builder.build, builder.position, input.drop!(xx))
431431
else
432432
break
@@ -465,7 +465,7 @@ def _read_simple_element(input, repeatable, segment_id, element_idx)
465465
length = rs - offset
466466
buffer = input[offset, length]
467467

468-
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
468+
builder.add(Tokens::SimpleElementTok.build(buffer.clean, repeat_pos))
469469
offset = rs + 1
470470
repeat_pos = input.position_at(offset)
471471
elsif xx
@@ -474,7 +474,7 @@ def _read_simple_element(input, repeatable, segment_id, element_idx)
474474
length = xx - offset
475475
buffer = input[offset, length]
476476

477-
builder.add(Tokens::SimpleElementTok.build(buffer, repeat_pos))
477+
builder.add(Tokens::SimpleElementTok.build(buffer.clean, repeat_pos))
478478
return done(builder.build, start_pos, input.drop!(xx))
479479
else
480480
break

spec/lib/stupidedi/reader/substring_spec.rb

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -935,8 +935,8 @@ def matchp(num_calls)
935935
end
936936

937937
describe "#rstrip" do
938-
let(:sb) { Stupidedi::Reader::Pointer.build(" abc ") }
939-
let(:mb) { Stupidedi::Reader::Pointer.build(" 💃🏽🕺🏻 ") }
938+
let(:sb) { pointer(" abc ") }
939+
let(:mb) { pointer(" 💃🏽🕺🏻 ") }
940940

941941
context "when string is empty" do
942942
specify { expect(sb.take(0).rstrip).to eq("") }
@@ -955,15 +955,16 @@ def matchp(num_calls)
955955
end
956956

957957
allocation do
958+
sb; mb # pre-allocate Substring
958959
expect{ mb.rstrip }.to allocate(String: 0, mb.class => 1)
959960
expect{ sb.rstrip }.to allocate(String: 0, sb.class => 1)
960961
end
961962
end
962963
end
963964

964965
describe "#lstrip" do
965-
let(:sb) { Stupidedi::Reader::Pointer.build(" abc ") }
966-
let(:mb) { Stupidedi::Reader::Pointer.build(" 💃🏽🕺🏻 ") }
966+
let(:sb) { pointer(" abc ") }
967+
let(:mb) { pointer(" 💃🏽🕺🏻 ") }
967968

968969
context "when string is empty" do
969970
specify { expect(sb.take(0).lstrip).to eq("") }
@@ -982,6 +983,7 @@ def matchp(num_calls)
982983
end
983984

984985
allocation do
986+
sb; mb # pre-allocate Substring
985987
expect{ sb.lstrip }.to allocate(String: 0, sb.class => 1)
986988
expect{ mb.lstrip }.to allocate(String: 0, mb.class => 1)
987989
end
@@ -1015,4 +1017,34 @@ def matchp(num_calls)
10151017

10161018
describe "#max_whitespace_index" do
10171019
end
1020+
1021+
describe "#clean" do
1022+
context "when no control characters present" do
1023+
let(:sb) { pointer("*A^B~C:D*A:B~C*D^") }
1024+
let(:mb) { pointer(" 💃🏽🕺🏻 ") }
1025+
1026+
allocation do
1027+
sb; mb # pre-allocate Substring
1028+
expect { sb.clean }.to allocate(String: 0)
1029+
expect { mb.clean }.to allocate(String: 0)
1030+
end
1031+
1032+
specify { expect(sb.clean).to eq(sb) }
1033+
specify { expect(mb.clean).to eq(mb) }
1034+
end
1035+
1036+
context "when some control characters present" do
1037+
let(:sb) { pointer("*A^B\r\nC:D*A:B\r\nC*D^") }
1038+
let(:mb) { pointer("\r\n💃🏽🕺🏻\r\n") }
1039+
1040+
allocation do
1041+
sb; mb # pre-allocate Substring
1042+
expect { sb.clean }.to allocate(String: 3)
1043+
expect { mb.clean }.to allocate(String: 1)
1044+
end
1045+
1046+
specify { expect(sb.clean).to eq(sb.storage.gsub(/[\r\n]/, "")) }
1047+
specify { expect(mb.clean).to eq(mb.storage.gsub(/[\r\n]/, "")) }
1048+
end
1049+
end
10181050
end

0 commit comments

Comments
 (0)