# uksiem-parser/textparser.py at master · finiteprods/uksiem-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
import pyparsing as p
import json
import argparse
from pprint import pprint
from typing import Dict, Any
# import sys

# Treat form feed (<FF>, '\x0c') as default whitespace, alongside space,
# newline and tab, so that it may be ignored between tokens.
# NOTE this is a module-level side effect: it changes the default for every
# pyparsing element created after this call.
p.ParserElement.setDefaultWhitespaceChars(' \n\t\x0c')

# p.printables only covers ASCII chars https://stackoverflow.com/a/2340659
# Disabled alternative that would build `word` from every non-space Unicode
# character instead:
# unicodePrintables = ''.join(chr(c)
#                             for c in range(sys.maxunicode)
#                             if not chr(c).isspace())
# word = p.Word(unicodePrintables)

# a "word" is any run of characters drawn from p.printables
word = p.Word(p.printables)


def multiLit(sentence: str) -> p.ParseExpression:
    """Build a parser matching the words of *sentence* in order, ignoring case.

    The sentence is split on whitespace and every word becomes its own
    ``CaselessLiteral``; the literals are then chained with ``And``.
    Matching word by word (rather than as one literal) means the amount of
    whitespace between the words in the input does not have to match the
    spacing used in *sentence*.
    """
    literals = [p.CaselessLiteral(token) for token in sentence.split()]
    return p.And(literals)


# Template for the long "Matters of special interest ..." heading:
# {0} is filled with the committee name(s) and {1} with an optional trailing
# character ('', '.' or ']' in the variants built below).
mattersHead = 'Matters of special interest to the {0} Committee on ' + \
              'Statutory Instruments{1}'

# Fixed section headings accepted by `sechead` (matched caselessly there).
# mostly taken from SIP circulars '04, '08, '10
secheadings = ['Introduction', 'Description',
               'Purpose of the instruments', 'Purpose of the instrument',
               'Purpose of the instrument(s)',
               'Purpose of the statutory instrument',
               'Purpose of instrument', 'Purpose of instruments',
               'Purpose of this instrument', 'Purpose of these instruments',
               'Matters of special interest to Parliament',
               'Matters of special interest',
               'Legislative Background', 'Legislative Context',
               'Legislative Content',
               'Extent', 'Territorial Extent',
               'Territorial Extent and Application',
               'Territorial Extent & Application',
               'Territorial Application and Extent',
               'Extent and Territorial Application',
               'European Convention on Human Rights', 'Policy background',
               'Policy background - What is being done and why',
               'Consultation outcome', 'Guidance', 'Impact',
               'Impact assessment',
               'Regulating small business', 'Consultation',
               'Monitoring & review', 'Monitoring and review']

# longer headings require more care, can span multiple lines
# Each variant is therefore built word by word with multiLit: the Joint and
# Select committee forms (with and without a trailing '.'), plus the combined
# "Joint ... or the Select" form with and without square brackets.
secLongHeads = [multiLit(mattersHead.format('Joint', '')),
                multiLit(mattersHead.format('Joint', '.')),
                multiLit(mattersHead.format('Select', '')),
                multiLit(mattersHead.format('Select', '.')),
                multiLit(mattersHead.format('Joint Committee on Statutory ' +
                                            'Instruments or the Select',
                                            '')),
                multiLit(mattersHead.format('[Joint Committee on Statutory ' +
                                            'Instruments or the Select',
                                            ']'))]

# The document header starts with 'EXPLANATORY' (any case) ...
firstKeyword = p.CaselessLiteral('EXPLANATORY')
# NOTE p.CloseMatch is very buggy don't use it

# ... followed by 'MEMORANDUM' (any case) or an exact-case 'NOTE'/'DOCUMENT'
secondKeyword = p.CaselessLiteral('MEMORANDUM') \
              | p.Literal('NOTE') \
              | p.Literal('DOCUMENT')

# header line: both keywords plus the remainder of that line, returned as the
# original input text rather than as separate tokens
title0 = p.originalTextFor(firstKeyword +
                           secondKeyword +
                           p.restOfLine)

# a year is exactly four digits, optionally followed by '.', or four digits
# wrapped in square brackets
yr = p.Word(p.nums, exact=4)
year = yr + p.Optional('.') \
     | '[' + yr + ']'

# an instrument number is up to four digits or the literal 'DRAFT' / 'XXXX',
# bare or wrapped in square brackets
num = p.Word(p.nums, max=4) \
    | p.Literal('DRAFT') \
    | p.Literal('XXXX')
number = num | '[' + num + ']'
# NOTE making sq brackets optionals will mess up originalTextFor real bad

# what sits between year and number: 'No.', 'No', 'No .' or '/', any case
separator = p.oneOf(['No.', 'No', 'No .', '/'], caseless=True)

# instrument ID as original text: year, separator, number and the rest of the
# line; or, when the number is missing, just year and separator before the
# line end (e.g. "2015 No.")
docid = p.originalTextFor(year +
                          separator +
                          # p.SkipTo(number, failOn=p.LineEnd()) +
                          number +
                          p.restOfLine) \
      | p.originalTextFor(year + separator + p.LineEnd())

# instrument title: every word up to the next docid, joined by single spaces
title = p.OneOrMore(word, stopOn=docid).setParseAction(' '.join)

# start of the first section: one or more line ends, then '1' or '1.' and
# whitespace; or an 'Introduction' heading immediately followed by a line end
startsec1 = p.OneOrMore(p.LineEnd()) \
          + p.Combine('1' + p.Optional('.')) + p.White() \
          | p.Literal('Introduction') + p.LineEnd()

# front matter: discard everything before the header line, keep the header as
# 'Header', then collect one or more (Title, ID) groups under 'Titles',
# stopping as soon as the first section starts
frontMatter = p.Suppress(p.SkipTo(title0)) + \
              title0('Header') + \
              p.OneOrMore(p.Group(title('Title') + docid('ID')),
                          stopOn=startsec1)('Titles')

# one component of a (sub)section number: at most two digits
snum = p.Word(p.nums, max=2)

# section number with optional trailing '.', stored under 'Number'
secnum = p.Combine(snum + p.Optional('.'))('Number')

# only section numbers 1 and 2 are accepted without a heading after them
unheadedSecnum = p.oneOf(['1.', '2.', '1', '2'])('Number')

# forbid free-form section headings for sanity's sake
# i.e. a heading is either one of the long multi-word variants (tokens joined
# back into one string) or one of the fixed secheadings, matched caselessly
sechead = p.Or(secLongHeads).setParseAction(' '.join)('Heading') \
        | p.oneOf(secheadings, caseless=True)('Heading')

# subsection number: two or more dot-separated components, optional final '.'
subsecnum = p.Combine(snum + '.' + snum +
                      p.ZeroOrMore('.' + snum) + p.Optional('.'))

# LEs: one or more line ends; linebreak: at least two consecutive line ends;
# pgbreak: a line end followed by a form feed
LEs = p.OneOrMore(p.LineEnd())
linebreak = p.LineEnd()*2 + p.ZeroOrMore(p.LineEnd())
pgbreak = p.LineEnd() + p.White('\x0c')

# sections don't always start with a number but assume they follow line break
# NOTE some examples break this assumption
# hence be more permissive at least for the most common case
# (the numbered-and-headed form only needs a single line end before it)
startsec = LEs + secnum + sechead + p.LineEnd() \
         | linebreak + sechead + p.LineEnd() \
         | linebreak + unheadedSecnum + p.White() \
         | pgbreak + secnum + sechead + p.LineEnd() \
         | pgbreak + sechead + p.LineEnd()
# a subsection starts with line end(s), a subsection number and whitespace
startsubsec = LEs + subsecnum + p.White()
startSubsecOrSec = startsubsec | startsec
# subsection text additionally ends at any linebreak (see subsecTextContent)
endSubsecText = startSubsecOrSec | linebreak

# subsec text should also stop when going back up to section text
# assume that is preceded by a line break
# (words are joined back into a single space-separated string)
subsecTextContent = p.OneOrMore(word, stopOn=endSubsecText) \
                     .setParseAction(' '.join)

# one subsection: its number followed by its text, grouped together
subsec = p.Group(subsecnum + subsecTextContent)

# free section text: words up to the next subsection or section start,
# joined by single spaces
secTextContent = p.OneOrMore(word, stopOn=startSubsecOrSec) \
                  .setParseAction(' '.join)

subsecgroup = p.OneOrMore(subsec)

# gotta be careful with secTextContent it'll eat through sections if you let it
# do negative lookaheads to be safe
# subsecsText: content that begins with subsections, optionally alternating
# with free text; textSubsecs: content that begins with free text instead
subsecsText = p.Group(subsecgroup +
                      p.ZeroOrMore(~startsec + secTextContent +
                                   subsecgroup) +
                      p.Optional(~startsec + secTextContent))
textSubsecs = p.Group(secTextContent +
                      p.ZeroOrMore(subsecgroup +
                                   ~startsec + secTextContent) +
                      p.Optional(subsecgroup))

content = (subsecsText | textSubsecs)('Content')

# a section is, in order of preference: number + heading + content,
# an unheaded number (1 or 2) + content, or an unnumbered heading + content
section = p.Group(secnum + sechead + content) \
        | p.Group(unheadedSecnum + content) \
        | p.Group(sechead + content)

# the 'Contact(s)' heading (any case) marks the terminal section
termsechead = p.oneOf(['Contacts', 'Contact'], caseless=True)

# terminal section start: line end(s) then either a numbered contact heading,
# or an unnumbered one that is immediately followed by a line end
terminalsec = p.OneOrMore(p.LineEnd()) + secnum + termsechead \
            | p.OneOrMore(p.LineEnd()) + termsechead + p.LineEnd()

# whole document: front matter, then sections collected under 'Sections'
# until the terminal section is reached
document = frontMatter + p.OneOrMore(section, stopOn=terminalsec)('Sections')

# front matter plus a single section, stored under 'Section'
intro = frontMatter + section('Section')

# suppressed helpers: they must match but contribute no tokens to the result
LS = p.Suppress(p.LineStart())
LE = p.Suppress(p.LineEnd())
space = p.Suppress(p.White(' '))

# a footnote opens with a number of at most two digits (stored as 'Number'),
# then a line end, then one or more spaces
startFootnote = LS + p.Word(p.nums, max=2)('Number') + LE + space

# footnote text ends at the next footnote or at a form feed
endFootnote = startFootnote | p.White('\x0c')

# one footnote: its number plus its words joined by single spaces ('Content')
footnote = p.Group(startFootnote + p.OneOrMore(word, stopOn=endFootnote)
                                    .setParseAction(' '.join)('Content'))

# a run of consecutive footnotes, stopping at a form feed
footnotegroup = p.OneOrMore(footnote, stopOn=p.White('\x0c'))

# make the main document parser skip over footnote groups
# NOTE seems to affect performance a fair bit...
document.ignore(footnotegroup)

# template watermark: 'TNA/EM/10-2015', optionally followed by '.1'
watermark = p.Combine(p.Literal('TNA/EM/10-2015') + p.Optional('.1'))

# the order is significant here
# NOTE since 1st "token" is whitespace prefix with an ending word to latch on
# Alternatives, in order: a number then a form feed; a number then the
# watermark; the watermark then a number (all three after a linebreak); and
# finally a number on its own line directly before a form feed.
footer = linebreak + p.Word(p.nums) + LE + p.FollowedBy(p.White('\x0c')) \
       | linebreak + p.Word(p.nums) + LE + watermark + LE \
       | linebreak + watermark + LE + p.Word(p.nums) + LE \
       | LS + p.Word(p.nums) + LE + p.FollowedBy(p.White('\x0c'))
# whatever a footer matched is replaced by a single newline token
footer.setParseAction(lambda: '\n')

# ideally also include to end of last page in case of footnotes there
# but matching on last page break prob too much trouble
# trim: original text running from the header keywords up to and including
# the start of the terminal ('Contact') section
trim = p.originalTextFor(firstKeyword + secondKeyword +
                         p.SkipTo(terminalsec) + terminalsec)


# TODO maybe remove or find a less fragile way
def check(emdict: Dict[str, Any]) -> str:
    """Return a '; '-separated summary of post-parse validation failures.

    All checks are currently disabled, so *emdict* is never inspected and the
    result is always the empty string.
    """
    # Checks that used to live here (switched off, kept as a record):
    #   * 'Header', 'Intro', 'Titles' and 'Sections' must be keys of emdict,
    #     otherwise '<key> key missing' is reported;
    #   * emdict['Titles'] must hold at least one entry ('Titles missing');
    #   * every entry of emdict['Titles'] must carry both 'ID' and 'Title'
    #     ('ID or Title key missing').
    failures = []  # nothing is appended while the checks stay disabled
    return '; '.join(failures)


def parse2json(infile, outfile, ftnotes: bool) -> str:
    """Parse an explanatory-memorandum text file and write the result as JSON.

    The text is first narrowed to the span matched by ``trim``, then footers
    are stripped with ``footer``, and what remains is parsed with
    ``document``.

    Args:
        infile: path of the text file to read.
        outfile: path of the JSON file to write (overwritten if present).
        ftnotes: when true, additionally search the text for footnotes and
            store them under the ``'References'`` key of the output.

    Returns:
        ``'No matches for trim'`` when the trim pattern found nothing (the
        untrimmed text is parsed instead), otherwise the empty string.

    Raises:
        pyparsing.ParseException: if ``document`` cannot parse the text.
    """
    msg = ''
    with open(infile) as f:
        pdftxt = f.read()
    # test emptiness by value: `is ''` compares object identity, which is not
    # guaranteed for equal strings and is a SyntaxWarning on Python 3.8+
    if not pdftxt:
        print('Nothing read from {}'.format(infile))
    # attempt to trim out irrelevant parts of the document
    matches = trim('Trim').searchString(pdftxt, maxMatches=1)
    if len(matches) > 0:
        pdftxt = matches[0].Trim
    else:
        msg = 'No matches for trim'
    # attempt to trim footers / page numbers
    txt2 = (word + footer).transformString(pdftxt)
    # if not in hurry parse footnotes
    footnotes = footnotegroup.searchString(txt2) if ftnotes else []
    # main document parse
    parseDict = document.parseString(txt2).asDict()
    if len(footnotes) > 0:
        # sum() merges the per-match results into a single ParseResults
        parseDict['References'] = sum(footnotes).asList()
    with open(outfile, 'w') as f:
        json.dump(parseDict, f, indent=4, sort_keys=True)
    return msg


# TESTING CODE

# Fragments for the --test run: each entry is a (text, parser) pair whose text
# is fed to parser.parseString() by the __main__ block, which then
# pretty-prints the result.
testdocs = [('''                   DRAFT EXPLANATORY MEMORANDUM TO

THE EXCHANGE GAINS AND LOSSES (BRINGING INTO ACCOUNT GAINS OR
           LOSSES) (AMENDMENT) REGULATIONS 2015

                                       2015 No.

 THE LOAN RELATIONSHIPS AND DERIVATIVE CONTRACTS (DISREGARD
 AND BRINGING INTO ACCOUNT OF PROFITS AND LOSSES) (AMENDMENT)
                       REGULATIONS 2015

                                       [2015] No. 1961

     THE LOAN RELATIONSHIPS AND DERIVATIVE CONTRACT'S (CHANGE OF
       "BRITISH FILM") (AMENDMENT NO. 2) REGULATIONS 2015

                                       2015 No 1962

                                            AND

     THE LOAN RELATIONSHIPS - DERIVATIVE CONTRACTS (EXCHANGE
      GAINS AND LOSSES USING FAIR VALUE ACCOUNTING) (AMENDMENT)
                           REGULATIONS 2015

                                       2015 No. 1963 C.69
''', frontMatter), ('''
2. Purpose of the instrument

   This Order designates the Common Council of the City of London as a
   secondary authority in respect of areas listed at 7.5 of this memorandum to
   make an order providing for offences relating to dogs.
''', section('Section')), ('''
10. Impact

10.1   The impact on business, charities or voluntary bodies is neutral.

10.2   The impact on the public sector is neutral.

10.3   An Impact Assessment on the CNEA was completed in April 2005.

11. Regulating small business

The legislation does not apply to small businesses.
''', section('Section')), ('''
1.     This explanatory memo has been prepared by the Department of Energy and
       Climate Change and is laid before Parliament by Command of Her Majesty.
       This memo contains info for the Joint Committee on SIs.

2.     Description
''', section('Section')), ('''
                          EXPLANATORY MEMORANDUM TO

     THE REGISTERED PENSION SCHEMES AND RELIEVED NON-UK PENSION
        SCHEMES (LIFETIME ALLOWANCE TRANSITIONAL PROTECTION)
      (INDIVIDUAL PROTECTION 2014 NOTIFICATION) REGULATIONS 2014

                                         2014 No. 1842


1.     This explanatory memo has been prepared by Her Majesty's Revenue and
       Customs ("HMRC") and is laid before the House of Commons by Command
       of Her Majesty.
''', frontMatter), ('''
13.   Contact

      Fiona Henderson at HM Treasury Tel: 020 7270 5846 or email:
                    EXPLANATORY MEMORANDUM TO

 THE FINANCIAL SERVICES AND MARKETS ACT 2000 (AMENDMENTS
             TO PART 18A ETC.) REGULATIONS 2010

                                 2010 No. 1193
''', frontMatter), ('''
                          EXPLANATORY MEMORANDUM

     THE SCHOOL GOVERNANCE (ROLES, PROCEDURES AND ALLOWANCES)
              (ENGLAND) (AMENDMENT) REGULATIONS 2017

                                          2017 No. XXXX

1.      This explanatory memorandum has been prepared by the Department for
        Education and is laid before Parliament by Command of Her Majesty.

        This memorandum contains information for the Joint Committee on
        Statutory Instruments.

2.      Purpose of the instrument
''', intro), ('''
                              EXPLANATORY MEMORANDUM TO

     THE NATIONAL HEALTH SERVICE TRUST DEVELOPMENT AUTHORITY
    (DIRECTIONS AND MISCELLANEOUS AMENDMENTS ETC.) REGULATIONS
                               2016

                                              2016 No. 214

          Introduction
          This explanatory memorandum has been prepared by the Department of
          Health and is laid before Parliament by Command of Her Majesty.

          Purpose of the instrument
          The purpose of the NHS cannot be underestimated.
''', intro), ('''Majesty.

     2. Purpose of the instrument''', word + startsec), ('''
                       EXPLANATORY MEMORANDUM TO

         THE INCOME TAX (CONSTRUCTION INDUSTRY SCHEME)
                 (AMENDMENT) REGULATIONS 2013

                                    2013 No. 620

     1. This explanatory memorandum has been prepared by HM Revenue and
        Customs (HMRC) and is laid before the House of Commons by Command of
        Her Majesty.

     2. Purpose of the instrument

        2.1    These Regulations make minor amendments to the Income Tax
        (Construction Industry Scheme) Regulations 2005 (S.I. 2005/2045) ("the
        principal Regulations") which are necessary for the operation of HMRC's
        Real Time Information (RTI) programme from April 2013.
''', intro), ('''
2.      Purpose of the instrument

        2.1      Here is some content for the first subsection.

        2.2      Here is the next subsection content, that has a line starting
                 3 i.e. a number which trips up the parser thinking it's the
                 start of a new section. Now some bullets:
                     x supplemental instruments under section 42,
                     x onward property transfer instruments under section 43
                     x property transfer orders under section 45 of the Act.

        2.3      Last subsection.
''', section('Section2')), ('''
3. Matters of special interest to the Select Committee on Statutory
   Instruments

    3.1      This Order is subject to the affirmative resolution procedure.

3.   Matters of special interest to the [Joint Committee on Statutory
     Instruments or the Select Committee on Statutory Instruments]

     3.1     None

3 Matters of special interest to the Joint Committee on Statutory Instruments

   3.1 Nada

7.      Policy background

        What is being done and why
7.1     These Regulations form part of the scheme.

       Consolidation
7.6    The Department does not intend to consolidate these Regulations.

3.    Matters of special interest to Parliament

      Matters of special interest to the Joint Committee on Statutory
      Instruments
3.1   None.

      Other matters of interest to the House of Commons
3.2   Does not arise at this stage.
''', p.OneOrMore(section)('Sections')), ('''
2.   Purpose of the instrument

     Students

     2.1    These Regulations make amendments to the income-related benefit
     Regulations to increase the amount of the disregards from student loan
     or personal maintenance grant income in respect of travel, books and
     equipment.

     Jobseekers Allowance

     2.2    They also amend schedule 1 to the Jobseeker's Allowance regulations
     regarding the amount payable to couples with one member age 18 or over
     and the other member under 18.
''', section('Purpose')), ('''
9.   Guidance

     9.1    There's a page break straight after this sentence.'''
                           + '\n\x0c'
                           + '''10.   Impact

10.1 No Impact Assessment has been prepared because there is no
      additional impact on any part.

11.   Regulating small business

      11.1 The legislation applies to small business.''',
                           section('Guidance') +
                           section('Impact') +
                           section('RSB')), ('''
1
 "Maintained school" is defined in section 84 of the SSFA 1998 to mean a
community, foundation or voluntary school.
2
 Here's another footnote for good measure.
                           ''' + '\x0c', footnotegroup('Footnotes')), ('''
1.   This explanatory memorandum has been prepared by Her Majesty's Revenue
     & Customs (HMRC) and is laid before the House of Commons by Command
     of Her Majesty.
     This memorandum contains information for the Select Committee on Statutory
     Instruments.
2.   Purpose of instrument
     This Order amends the Capital Allowances (Energy-Saving Plant and
     Machinery) Order 2001 (S.I. 2001/2541).
3.   Matters of special interest to the Select Committee on Statutory
     Instruments
     None
''', section('Intro') + section('Purpose') + section('Matters'))]

# Fragments for the --test run: each entry is a (text, parser) pair whose text
# is passed to parser.transformString() by the __main__ block. Every case
# uses `word + footer`, and each text contains a form feed ('\x0c').
testTransform = [('''bottom.

                           1
TNA/EM/10-2015
''' + '\x0c' + '''7.       Policy background

         What is being done and why
''', word + footer), ('''
           x Bottom of page.


                                             1
''' + '\x0c' + '''3.   Matters of special interest to the Joint Committee on SI

     3.1      None.
''', word + footer), ('''
         boundaries of wards or divisions for a specific local authority.


TNA/EM/10-2015
                                                  1
''' + '\x0c' + '''         the electoral review of Exeter in December 2014.
''', word + footer), ('''
7.   Policy background
     What is being done and why

1
''' + '\x0c' + '''    7.1     The UK "Securing the Border" Strategy was published
''', word + footer)]

# Fragments for the --test run: each entry is a (text, parser) pair whose text
# is scanned with parser.searchString() by the __main__ block. The single case
# here places two footnotes between body text and a form feed ('\x0c').
testSearch = [('''
7.5        Our assessment of the proposed changes concludes that they have no
           impact on the private or civil society sectors. Schools and

9
    Section 88H of the SSFA 1998, as amended by section 36 of the Education
Act 2011.
10
     Section 88H of the SSFA 1998, as amended by section 64 of the Education
Act 2011
''' + '\x0c' + '     educational institutions in the public sector',
               footnotegroup)]

if __name__ == '__main__':
    # Command-line entry point: with -t/--test, run every test fragment
    # through its parser and print the outcome; without it, do nothing.

    def _show_parse(parser, text):
        """Parse *text* from its start; print the result as dict, then list."""
        parse = parser.parseString(text)
        pprint(parse.asDict())
        parse.pprint()

    def _show_transform(parser, text):
        """Print *text* as rewritten by parser.transformString()."""
        print(parser.transformString(text))

    def _show_search(parser, text):
        """Print the merged tokens of every match found in *text*."""
        matches = parser.searchString(text)
        # sum() over zero matches is the int 0, which has no asList();
        # show an empty list instead of dying with an AttributeError
        pprint(sum(matches).asList() if len(matches) > 0 else [])

    ap = argparse.ArgumentParser()
    ap.add_argument('-t', '--test', help='run sanity tests',
                    action='store_true')
    args = ap.parse_args()
    if args.test:
        print('Parsing test fragments...\n')
        # each suite pairs a list of (text, parser) cases with its runner;
        # suites and cases run in the order they are listed
        suites = [(testdocs, _show_parse),
                  (testTransform, _show_transform),
                  (testSearch, _show_search)]
        for cases, show in suites:
            for text, parser in cases:
                print(text)
                try:
                    show(parser, text)
                    print('\n' + '*'*80)
                except p.ParseException as err:
                    # echo the offending line with a caret under the column
                    # where parsing failed, then the error itself
                    print(err.line)
                    print(' '*(err.column-1) + '^')
                    print(err)