
Commit 6e9c83c

[FileFormats.LP] allow newline in term and improve keyword identification (#2847)
1 parent b845d44 commit 6e9c83c

2 files changed: +194 −30 lines changed


src/FileFormats/LP/read.jl

Lines changed: 101 additions & 30 deletions
@@ -280,13 +280,15 @@ A struct that is used to manage state when lexing. It stores:
    error message to the user on a parse error
  * `peek_char`: the next `Char` in the `io`
  * `peek_tokens`: the list of upcoming tokens that we have already peeked
+ * `current_token`: the most recent token that we have `read`
 """
 mutable struct _LexerState{O<:IO}
     io::O
     line::Int
     peek_char::Union{Nothing,Char}
     peek_tokens::Vector{_Token}
-    _LexerState(io::IO) = new{typeof(io)}(io, 1, nothing, _Token[])
+    current_token::Union{Nothing,_Token}
+    _LexerState(io::IO) = new{typeof(io)}(io, 1, nothing, _Token[], nothing)
 end

 """
@@ -351,6 +353,7 @@ function Base.read(state::_LexerState, ::Type{_Token})
         )
     end
     popfirst!(state.peek_tokens)
+    state.current_token = token
     return token
 end

@@ -371,6 +374,16 @@ end

 _is_number(c::Char) = isdigit(c) || c in ('.', 'e', 'E', '+', '-')

+_nothing_or_newline(::Nothing) = true
+_nothing_or_newline(t::_Token) = t.kind == _TOKEN_NEWLINE
+
+function _prior_token(state::_LexerState)
+    if length(state.peek_tokens) <= 1
+        return state.current_token
+    end
+    return state.peek_tokens[end-1]
+end
+
 function Base.peek(state::_LexerState, ::Type{_Token}, n::Int = 1)
     @assert n >= 1
     while length(state.peek_tokens) < n
@@ -379,22 +392,58 @@ function Base.peek(state::_LexerState, ::Type{_Token}, n::Int = 1)
             return nothing
         end
         push!(state.peek_tokens, token)
-        if _compare_case_insenstive(token, "subject")
+        if token.kind != _TOKEN_IDENTIFIER
+            continue
+        end
+        # Here we have a _TOKEN_IDENTIFIER. But if it is not preceded by a
+        # _TOKEN_NEWLINE, it cannot be a _TOKEN_KEYWORD.
+        if !_nothing_or_newline(_prior_token(state))
+            continue
+        end
+        # It might be a _TOKEN_KEYWORD.
+        kw = _case_insenstive_identifier_to_keyword(token.value)
+        if kw !== nothing
+            # The token matches a single-word keyword. All keywords are
+            # followed by a new line, or an EOF.
             t = _peek_inner(state)
-            if _compare_case_insenstive(t, "to")
-                state.peek_tokens[end] =
-                    _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
-            else
+            if _nothing_or_newline(t)
+                state.peek_tokens[end] = _Token(_TOKEN_KEYWORD, kw, token.pos)
+            end
+            if t !== nothing
                 push!(state.peek_tokens, t)
             end
-        elseif _compare_case_insenstive(token, "such")
-            t = _peek_inner(state)
-            if _compare_case_insenstive(t, "that")
+            continue
+        end
+        # There are two keywords that contain whitespace: `subject to` and
+        # `such that`.
+        for (a, b) in ("subject" => "to", "such" => "that")
+            if !_compare_case_insenstive(token, a)
+                continue
+            end
+            # This _might_ be `subject to`, or it might just be a variable
+            # named `subject`, like `obj:\n subject\n`.
+            token_b = _peek_inner(state)
+            if token_b === nothing
+                # The next token is EOF. Nothing to do here.
+                break
+            elseif !_compare_case_insenstive(token_b, b)
+                # The second token doesn't match. Store `token_b` and break.
+                push!(state.peek_tokens, token_b)
+                break
+            end
+            # We have something that matches (a, b), but a _TOKEN_KEYWORD needs
+            # to be followed by a new line.
+            token_nl = _peek_inner(state)
+            if _nothing_or_newline(token_nl)
                 state.peek_tokens[end] =
                     _Token(_TOKEN_KEYWORD, "CONSTRAINTS", token.pos)
             else
-                push!(state.peek_tokens, t)
+                push!(state.peek_tokens, token_b)
+            end
+            if token_nl !== nothing
+                push!(state.peek_tokens, token_nl)
             end
+            break
         end
     end
     return state.peek_tokens[n]
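
The observable effect of the stricter keyword identification, mirrored by the
new tests below: an identifier is promoted to a keyword only at the start of a
line, and `subject`/`such` only when followed by `to`/`that` and a newline or
EOF. A usage sketch, assuming MathOptInterface is installed and `LP` is
`MOI.FileFormats.LP`:

import MathOptInterface as MOI
const LP = MOI.FileFormats.LP

# `subject` at end-of-file is a variable name, not the `subject to` keyword.
io = IOBuffer("maximize\nobj:\nsubject")
model = LP.Model()
MOI.read!(io, model)
x = MOI.get(model, MOI.VariableIndex, "subject")  # a valid MOI.VariableIndex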
@@ -426,11 +475,7 @@ function _peek_inner(state::_LexerState)
             write(buf, c)
             _ = read(state, Char)
         end
-        val = String(take!(buf))
-        if (kw = _case_insenstive_identifier_to_keyword(val)) !== nothing
-            return _Token(_TOKEN_KEYWORD, kw, pos)
-        end
-        return _Token(_TOKEN_IDENTIFIER, val, pos)
+        return _Token(_TOKEN_IDENTIFIER, String(take!(buf)), pos)
     elseif (op = get(_OPERATORS, c, nothing)) !== nothing
         _ = read(state, Char) # Skip c
         if c == '-' && peek(state, Char) == '>'
if c == '-' && peek(state, Char) == '>'
@@ -473,6 +518,19 @@ function _skip_newlines(state::_LexerState)
473518
return
474519
end
475520

521+
function _next_non_newline(state::_LexerState)
522+
n = 1
523+
while true
524+
t = peek(state, _Token, n)
525+
if t === nothing
526+
return nothing
527+
elseif t.kind != _TOKEN_NEWLINE
528+
return t
529+
end
530+
n += 1
531+
end
532+
end
533+
476534
# IDENTIFIER := "string"
477535
#
478536
# There _are_ rules to what an identifier can be. We handle these when lexing.
@@ -605,14 +663,10 @@ function _parse_quad_expression(
             )
         end
     end
-    while _next_token_is(state, _TOKEN_NEWLINE)
-        if _next_token_is(state, _TOKEN_KEYWORD, 2)
-            break
-        end
-        _ = read(state, _Token, _TOKEN_NEWLINE)
-    end
-    if _next_token_is(state, _TOKEN_DIVISION)
-        _ = read(state, _Token) # /
+    t = _next_non_newline(state)
+    if t !== nothing && t.kind == _TOKEN_DIVISION
+        _skip_newlines(state)
+        _ = read(state, _Token, _TOKEN_DIVISION) # /
         # Must be /2
         n = read(state, _Token, _TOKEN_NUMBER)
         if n.value != "2"
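
A sketch of the input this hunk handles: the `/ 2` divisor of a quadratic
bracket may be separated from the closing `]` by newlines. Assumes
MathOptInterface is installed; the model layout mirrors the tests below:

import MathOptInterface as MOI
const LP = MOI.FileFormats.LP

io = IOBuffer("minimize\nobj: [ x * x ]\n/ 2\nsubject to\nbounds\nx free\nend")
model = LP.Model()
MOI.read!(io, model)  # the objective parses as 0.5 * x * x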
@@ -634,10 +688,11 @@ function _parse_quad_expression(
 end

 # TERM :=
-#     "+" TERM
+#     [\n*] TERM
+#   | "+" TERM
 #   | "-" TERM
-#   | NUMBER
 #   | IDENTIFIER
+#   | NUMBER
 #   | NUMBER IDENTIFIER
 #   | NUMBER "*" IDENTIFIER
 #   | QUADRATIC_EXPRESSION
@@ -670,12 +725,28 @@ function _parse_term(
             _ = read(state, _Token, _TOKEN_MULTIPLICATION)
             x = _parse_variable(state, cache)
             return MOI.ScalarAffineTerm(coef, x)
-        elseif _next_token_is(state, _TOKEN_NEWLINE) ||
-               _next_token_is(state, _TOKEN_ADDITION) ||
-               _next_token_is(state, _TOKEN_SUBTRACTION)
-            # NUMBER
-            return coef
+        elseif _next_token_is(state, _TOKEN_NEWLINE)
+            # This could either be NUMBER \nEND-OF-TERM, or it could be a term
+            # split by a new line, like `2\nx`.
+            t = _next_non_newline(state)
+            if t === nothing
+                # NUMBER
+                return coef
+            elseif t.kind == _TOKEN_MULTIPLICATION
+                # NUMBER \n * [\n] IDENTIFIER
+                _skip_newlines(state)
+                _ = read(state, _Token, _TOKEN_MULTIPLICATION)
+                _skip_newlines(state)
+                x = _parse_variable(state, cache)
+                return MOI.ScalarAffineTerm(coef, x)
+            elseif t.kind == _TOKEN_IDENTIFIER
+                # NUMBER \n IDENTIFIER
+                x = _parse_variable(state, cache)
+                return MOI.ScalarAffineTerm(coef, x)
+            end
         end
+        # NUMBER
+        return coef
     elseif _next_token_is(state, _TOKEN_OPEN_BRACKET)
         # QUADRATIC_EXPRESSION
         return _parse_quad_expression(state, cache, prefix)
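
The matching behavior for affine terms, mirroring
test_parse_newline_in_objective_expression below: a coefficient and its
variable may now be split by newlines, with or without an explicit `*`.
Assumes MathOptInterface is installed:

import MathOptInterface as MOI
const LP = MOI.FileFormats.LP

for obj in ["2 x", "2\nx", "2*\nx", "2\n*x"]
    io = IOBuffer("maximize\nobj: $obj\nsubject to\nbounds\nx free\nend")
    model = LP.Model()
    MOI.read!(io, model)
    x = MOI.get(model, MOI.VariableIndex, "x")
    F = MOI.ScalarAffineFunction{Float64}
    f = MOI.get(model, MOI.ObjectiveFunction{F}())
    @assert isapprox(f, 2.0 * x)  # the objective is 2x in every case
end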

test/FileFormats/LP/LP.jl

Lines changed: 93 additions & 0 deletions
@@ -1554,6 +1554,99 @@ function test_new_line_edge_case_fails()
     return
 end

+function test_parse_keyword_edge_cases_identifier_is_keyword()
+    for name in ["max", "min", "st", "such", "bounds", "obj", "free"]
+        io = IOBuffer("""
+        maximize
+        obj: $name
+        subject to
+        $name <= 1
+        bounds
+        $name free
+        end
+        """)
+        seekstart(io)
+        model = LP.Model()
+        MOI.read!(io, model)
+        x = only(MOI.get(model, MOI.ListOfVariableIndices()))
+        @test MOI.get(model, MOI.VariableName(), x) == name
+    end
+    return
+end
+
+function test_parse_keyword_subject_to_errors()
+    for line in ["subject", "subject too", "subject to a:"]
+        io = IOBuffer("""
+        maximize
+        obj: x
+        $line
+        x <= 1
+        bounds
+        x free
+        end
+        """)
+        seekstart(io)
+        model = LP.Model()
+        @test_throws LP.ParseError MOI.read!(io, model)
+    end
+    return
+end
+
+function test_parse_newline_in_objective_expression()
+    for obj in ["2 x", "\n2 x", "2\nx", "2*\nx", "2\n*x", "2\n\n*\n\n\nx\n"]
+        io = IOBuffer("""
+        maximize
+        obj: $obj
+        subject to
+        bounds
+        x free
+        end
+        """)
+        seekstart(io)
+        model = LP.Model()
+        MOI.read!(io, model)
+        x = MOI.get(model, MOI.VariableIndex, "x")
+        f = 2.0 * x
+        g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}())
+        @test isapprox(f, g)
+    end
+    return
+end
+
+function test_parse_subject_eof()
+    io = IOBuffer("maximize\nobj:\nsubject")
+    seekstart(io)
+    model = LP.Model()
+    MOI.read!(io, model)
+    x = MOI.get(model, MOI.VariableIndex, "subject")
+    @test x isa MOI.VariableIndex
+    return
+end
+
+function test_parse_expr_eof()
+    io = IOBuffer("maximize\nobj: x + 2\n")
+    seekstart(io)
+    model = LP.Model()
+    MOI.read!(io, model)
+    x = MOI.get(model, MOI.VariableIndex, "x")
+    f = 1.0 * x + 2.0
+    g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}())
+    @test isapprox(f, g)
+    return
+end
+
+function test_parse_quadratic_expr_eof()
+    io = IOBuffer("maximize\nobj: [x * x]\n")
+    seekstart(io)
+    model = LP.Model()
+    MOI.read!(io, model)
+    x = MOI.get(model, MOI.VariableIndex, "x")
+    f = 1.0 * x * x
+    g = MOI.get(model, MOI.ObjectiveFunction{typeof(f)}())
+    @test isapprox(f, g)
+    return
+end
+
 end # module

 TestLP.runtests()
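
To run this test file on its own (a typical invocation; assumes a local
checkout of MathOptInterface.jl, since the file calls TestLP.runtests() when
included):

import Pkg
Pkg.activate(".")  # the root of a MathOptInterface.jl checkout (assumption)
include("test/FileFormats/LP/LP.jl")  # defines TestLP and runs the tests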
