keep-sorted/keepsorted/line_group.go at a9533a9640b2a44a31656b97f7bae32cd67f8778 · baz1/keep-sorted · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package keepsorted

import (
	"fmt"
	"regexp"
	"strings"
	"sync"
	"unicode"

	"github.com/rs/zerolog/log"
)

// lineGroup is a logical unit of source code. It's one or more lines combined
// with zero or more comment lines about the source code lines.
type lineGroup struct {
	// The options of the block this lineGroup was created for.
	opts        blockOptions
	// Returns the prefix order handed to the first capture group of every
	// regex token (see regexTokens). groupLines gives every lineGroup it
	// creates the same function.
	prefixOrder func() *prefixOrder

	// The actual content of the lineGroup.
	lineGroupContent

	// Track which methods are used during sorting so we can filter debugging
	// output to just the parts that are relevant.
	access accessRecorder
}

// compareLineGroups is the comparator for lineGroups. It chains the following
// keys, each one consulted only after the previous ones:
//  1. commentOnly(), using falseFirst() (presumably: groups that have content
//     lines come before comment-only groups; see falseFirst for the exact order).
//  2. regexTokens(), compared lexicographically with compareRegexTokens.
//  3. joinedLines().
//  4. joinedComment().
var compareLineGroups = comparingFunc((*lineGroup).commentOnly, falseFirst()).
	andThen(comparingFunc((*lineGroup).regexTokens, lexicographically(compareRegexTokens))).
	andThen(comparing((*lineGroup).joinedLines)).
	andThen(comparing((*lineGroup).joinedComment))

// compareRegexTokens compares two regexTokens. It first compares whether each
// token is nil (a nil token means the regex did not match), then compares the
// capture groups lexicographically with compareCaptureGroupTokens.
var compareRegexTokens = comparingFunc(func(t regexToken) bool { return t == nil }, falseFirst()).
	andThen(comparingFunc(func(t regexToken) []*captureGroupToken { return t }, lexicographically(compareCaptureGroupTokens)))

// compareCaptureGroupTokens compares two capture groups: first by prefix()
// using orderedPrefix.compare, then by transform() using numericTokens.compare.
var compareCaptureGroupTokens = comparingFunc((*captureGroupToken).prefix, orderedPrefix.compare).
	andThen(comparingFunc((*captureGroupToken).transform, numericTokens.compare))

// lineGroupContent holds the raw text of a lineGroup.
type lineGroupContent struct {
	// Comment lines attached to the group. They come before lines in
	// allLines().
	comment []string
	// Content lines of the group. Empty for a comment-only group.
	lines   []string
}

// accessRecorder remembers which lineGroup accessors have been called so that
// DebugString can restrict its output to the values that were actually used.
// lineGroup.append and lineGroup.trimSuffix reset it to its zero value.
type accessRecorder struct {
	// Set by lineGroup.commentOnly.
	commentOnly   bool
	// Allocated by lineGroup.regexTokens: one entry per regex, each holding one
	// recorder per capture group.
	regexTokens   []regexTokenAccessRecorder
	// Set by lineGroup.joinedLines.
	joinedLines   bool
	// Set by lineGroup.joinedComment.
	joinedComment bool
}

// matchesAnyRegex reports whether s matches at least one of the regexes.
// It returns false when regexes is nil or empty.
func matchesAnyRegex(s string, regexes []*regexp.Regexp) bool {
	for _, regex := range regexes {
		// MatchString answers the yes/no question directly. Unlike
		// FindStringSubmatch it does not build a submatch slice that would be
		// thrown away immediately.
		if regex.MatchString(s) {
			return true
		}
	}
	return false
}

// groupLines splits lines into one or more lineGroups based on the provided options.
//
// Every returned lineGroup carries the sticky-prefixed lines that directly
// precede it (as its comment) plus the content lines that belong together
// according to metadata.opts:
//   - Block: lines are joined until braces and quotes are balanced.
//   - Group: lines indented further than the first content line, lines with a
//     group prefix, and nested keep-sorted blocks stay with the current group.
//   - GroupStartRegex / GroupEndRegex: groups are delimited by lines matching
//     one of the regexes.
//
// Trailing empty lines of a finished group are split off into their own
// single-line groups. The comment and lines of the returned groups are
// sub-slices of lines.
func groupLines(lines []string, metadata blockMetadata) []*lineGroup {
	var groups []*lineGroup
	// Tracks which subsection of lines contains the comments for the current lineGroup.
	var commentRange indexRange
	// Tracks which subsection of lines contains the content for the current lineGroup.
	var lineRange indexRange

	// group=yes and block=no, these pieces of information are used to determine
	// when we group lines together into a single group.

	// Indent: All lines indented further than the first line are grouped together.
	// Edge case: Whitespace-only lines are included in the group based on the
	// indentation of the next non-empty line after the whitespace-only line.
	var indents []int
	var initialIndent *int
	// Counts the number of unmatched start directives we've seen in the current group.
	// We will include entire keep-sorted blocks as grouped lines to avoid
	// breaking nested keep-sorted blocks that don't have indentation.
	var numUnmatchedStartDirectives int

	// block=yes: The code block that we're constructing until we have matched braces and quotations.
	var block codeBlock

	// Shared by every lineGroup; the prefix order is only built on first use.
	prefixOrder := sync.OnceValue(func() *prefixOrder { return newPrefixOrder(metadata.opts) })

	if metadata.opts.Group {
		indents = calculateIndents(lines)
	}

	// Determines whether the current block is still accepting additional lines.
	shouldAddToBlock := func() bool {
		return metadata.opts.Block && !lineRange.empty() && block.expectsContinuation()
	}
	// Determines whether the current group should accept the next line.
	shouldAddToGroup := func(i int, l string) bool {
		if !metadata.opts.Group {
			return false
		}

		increasedIndent := !lineRange.empty() && initialIndent != nil && indents[i] > *initialIndent
		return increasedIndent || numUnmatchedStartDirectives > 0 || metadata.opts.hasGroupPrefix(l)
	}
	// Determines whether the current line should be part of a regex-delimited
	// group including any prior lines already visited.
	// Returns another boolean indicating whether the group should be ending
	// after that line if so.
	shouldAddToRegexDelimitedGroup := func(l string) (addToGroup bool, finishGroupAfter bool) {
		if metadata.opts.GroupStartRegex != nil {
			// For GroupStartRegex, all non-regex-matching lines should be
			// part of the group including prior lines.
			return !matchesAnyRegex(l, metadata.opts.GroupStartRegex), false
		}
		if metadata.opts.GroupEndRegex != nil {
			// For GroupEndRegex, the line should always be included in the
			// group including prior lines, but possibly terminate it.
			return true, matchesAnyRegex(l, metadata.opts.GroupEndRegex)
		}
		return false, false
	}
	// Updates numUnmatchedStartDirectives for a line. A line that contains the
	// start directive is counted as a start even if it also contains the end
	// directive.
	countStartDirectives := func(l string) {
		if strings.Contains(l, metadata.startDirective) {
			numUnmatchedStartDirectives++
		} else if strings.Contains(l, metadata.endDirective) {
			numUnmatchedStartDirectives--
		}
	}
	// append a line to both lineRange, and block, if necessary.
	appendLine := func(i int, l string) {
		lineRange.append(i)
		if metadata.opts.Block {
			block.append(l, metadata.opts)
		}
		if metadata.opts.Group {
			countStartDirectives(l)
			// The indent of the very first content line becomes the baseline
			// for every later indentation comparison; it is never reset.
			if initialIndent == nil {
				initialIndent = &indents[i]
				log.Printf("initialIndent: %d", *initialIndent)
			}
		}
	}
	// finish an outstanding lineGroup and reset our state to prepare for a new lineGroup.
	finishGroup := func() {
		// If the current lineRange ends with an extra empty line, remove it and place it in a separate group.
		// This is notably needed to support group_start_regex or group_end_regex being set at the same time as newline_separated.
		endingEmptyLines := 0
		for lineRange.size() > 1 && lines[lineRange.end-1] == "" {
			endingEmptyLines++
			lineRange.end--
		}
		groups = append(groups, &lineGroup{
			opts:             metadata.opts,
			prefixOrder:      prefixOrder,
			lineGroupContent: lineGroupContent{comment: slice(lines, commentRange), lines: slice(lines, lineRange)},
		})
		commentRange = indexRange{}
		lineRange = indexRange{}
		block = codeBlock{}
		for ; endingEmptyLines > 0; endingEmptyLines-- {
			groups = append(groups, &lineGroup{
				opts:             metadata.opts,
				prefixOrder:      prefixOrder,
				lineGroupContent: lineGroupContent{lines: []string{""}},
			})
		}
	}
	for i, l := range lines {
		if shouldAddToBlock() || shouldAddToGroup(i, l) {
			appendLine(i, l)
		} else if metadata.opts.hasStickyPrefix(l) {
			// Top-level comments break the current block/group.
			if !lineRange.empty() {
				finishGroup()
			}

			commentRange.append(i)
			if metadata.opts.Group {
				// Note: This line will not count end directives. If this call ever
				// finds a start directive, it will set numUnmatchedStartDirectives > 0
				// and then we will enter the shouldAddToGroup branch above where we'll
				// count end directives via its appendLine call.
				countStartDirectives(l)
			}
		} else if addToGroup, finishGroupAfter := shouldAddToRegexDelimitedGroup(l); addToGroup {
			appendLine(i, l)
			if finishGroupAfter {
				finishGroup()
			}
		} else {
			// Begin a new block or group.
			if !lineRange.empty() {
				finishGroup()
			}
			appendLine(i, l)
		}
	}
	// Flush whatever is left, including trailing comments without content.
	if !commentRange.empty() || !lineRange.empty() {
		finishGroup()
	}
	return groups
}

// calculateIndents returns the indentation of every line, computed up front.
// Lines without any non-space character take the indentation of the next line
// that has content, or -1 when no such line follows.
// Doing this once keeps a long run of blank lines in a group=yes block from
// causing bad worst-case behavior.
func calculateIndents(lines []string) []int {
	const blank = -1

	indents := make([]int, len(lines))
	for idx := range lines {
		if n, hasContent := countIndent(lines[idx]); hasContent {
			indents[idx] = n
		} else {
			indents[idx] = blank
		}
	}

	// Walk from the bottom up so each blank line can inherit the indentation of
	// the closest following line with content in a single pass.
	next := blank
	for idx := len(indents) - 1; idx >= 0; idx-- {
		if indents[idx] != blank {
			next = indents[idx]
		} else {
			indents[idx] = next
		}
	}

	return indents
}

// countIndent counts how many space characters (as defined by
// unicode.IsSpace) occur at the beginning of s.
//
// hasNonSpaceCharacter is false when s is empty or consists only of space
// characters; indent is 0 in that case. Otherwise indent is the number of
// leading space runes.
func countIndent(s string) (indent int, hasNonSpaceCharacter bool) {
	for _, ch := range s {
		if !unicode.IsSpace(ch) {
			return indent, true
		}
		indent++
	}
	// Every rune was a space. Deciding this inside the loop, rather than by
	// comparing the rune count with len(s), keeps the answer right for
	// multi-byte spaces such as U+00A0, whose byte length exceeds 1.
	return 0, false
}

// indexRange describes the half-open interval [start, end) of indexes that
// make up a lineGroup. It lets the group grow one index at a time without
// re-slicing the underlying data. The zero value is an empty range.
type indexRange struct {
	start, end int
	init       bool
}

// empty reports whether the range contains no indexes, either because nothing
// was appended yet or because start and end coincide.
func (r *indexRange) empty() bool {
	return r.size() == 0
}

// size returns the number of indexes in the range; 0 if nothing was appended.
func (r *indexRange) size() int {
	if r.init {
		return r.end - r.start
	}
	return 0
}

// append extends the range by index i. The first call sets the range to
// [i, i+1). Every later call must pass the current end, otherwise append
// panics.
func (r *indexRange) append(i int) {
	switch {
	case !r.init:
		r.start, r.end, r.init = i, i+1, true
	case r.end == i:
		r.end = i + 1
	default:
		panic(fmt.Errorf("cannot append %d to %#v because end is %d", i, r, r.end))
	}
}

// slice returns the part of s covered by r, or nil when r is empty. The result
// shares its backing array with s.
func slice(s []string, r indexRange) []string {
	if !r.empty() {
		return s[r.start:r.end]
	}
	return nil
}

var (
	// braces lists the opening/closing pairs whose occurrences codeBlock
	// counts to decide whether a block of code is complete.
	braces = []struct {
		open  string
		close string
	}{
		{"{", "}"},
		{"[", "]"},
		{"(", ")"},
	}
	// quotes lists the string delimiters findQuote recognizes. findQuote
	// returns the first entry that matches, so the three-character forms must
	// stay ahead of their one-character counterparts.
	quotes = []string{
		`"""`, `'''`, "```",
		`"`, `'`, "`",
	}
)

// codeBlock is a helper struct that let us try to understand if a section of
// code expects more lines to be "complete".
type codeBlock struct {
	// Number of times each brace string (opening and closing counted under
	// their own keys) was seen outside of a string literal. Allocated lazily
	// by append.
	braceCounts   map[string]int
	// The quote that will close the string literal we are currently inside,
	// or "" when we are not inside a string literal.
	expectedQuote string
}

// expectsContinuation reports whether the lines appended so far look
// incomplete, i.e. whether the next line is probably a continuation.
//
// The check is deliberately naive: the code is considered incomplete while a
// string literal is still open, or while the number of opening and closing
// symbols differs for any of the balanced pairs (parenthesis, square brackets,
// braces). Quotation marks within strings are ignored. This could be extended
// in the future (and possibly controlled by further options).
//
// Known limitations:
// - Parenthesis, square brackets, and braces could appear in any order
// - Parenthesis, square brackets, and braces within strings aren't ignored
func (cb *codeBlock) expectsContinuation() bool {
	if cb.expectedQuote != "" {
		return true
	}

	for _, pair := range braces {
		opened, closed := cb.braceCounts[pair.open], cb.braceCounts[pair.close]
		if opened != closed {
			return true
		}
	}
	return false
}

// append scans the line s and updates the brace counts and the open-quote
// state that expectsContinuation relies on. Outside of string literals, the
// rest of the line is skipped once opts.commentMarker (if non-empty) is found.
func (cb *codeBlock) append(s string, opts blockOptions) {
	if cb.braceCounts == nil {
		cb.braceCounts = make(map[string]int)
	}

	marker := opts.commentMarker
	// TODO(jfalgout): Does this need to handle runes more correctly?
	for pos := 0; pos < len(s); {
		inString := cb.expectedQuote != ""
		if !inString {
			// Not inside a string literal: braces are part of the syntax.
			ch := s[pos : pos+1]
			for _, pair := range braces {
				if ch == pair.open {
					cb.braceCounts[pair.open]++
				}
				if ch == pair.close {
					cb.braceCounts[pair.close]++
				}
			}
			// A trailing comment hides the remainder of the line.
			if marker != "" && strings.HasPrefix(s[pos:], marker) {
				break
			}
		}

		q := findQuote(s, pos)
		switch {
		case !inString && q != "":
			// A string literal starts here.
			cb.expectedQuote = q
			pos += len(q)
		case inString && q == cb.expectedQuote:
			// The current string literal ends here.
			cb.expectedQuote = ""
			pos += len(q)
		default:
			pos++
		}
	}
}

// findQuote checks whether one of the known quotes starts at position i of s
// and returns it, or "" if none does. Quotes are tried in the order of the
// quotes list. A single-character quote directly preceded by a backslash is
// treated as an escaped literal and ignored.
func findQuote(s string, i int) string {
	rest := s[i:]
	escaped := i > 0 && s[i-1] == '\\'
	for _, q := range quotes {
		if len(q) == 1 && escaped {
			// Ignore quote literals (\", \', \`)
			continue
		}
		if strings.HasPrefix(rest, q) {
			return q
		}
	}
	return ""
}

// append adds s to the end of the group's last line and clears the recorded
// accesses. It panics if the group has no lines.
func (lg *lineGroup) append(s string) {
	lg.access = accessRecorder{}
	last := len(lg.lines) - 1
	lg.lines[last] += s
}

// hasSuffix reports whether the group's last line ends with s. A group
// without lines never has a suffix.
func (lg *lineGroup) hasSuffix(s string) bool {
	if len(lg.lines) == 0 {
		return false
	}
	last := lg.lines[len(lg.lines)-1]
	return strings.HasSuffix(last, s)
}

// trimSuffix removes s from the end of the group's last line, if present, and
// clears the recorded accesses. It panics if the group has no lines.
func (lg *lineGroup) trimSuffix(s string) {
	lg.access = accessRecorder{}
	last := len(lg.lines) - 1
	lg.lines[last] = strings.TrimSuffix(lg.lines[last], s)
}

// commentOnly reports whether the group has no content lines, and records
// that this property was consulted.
func (lg *lineGroup) commentOnly() bool {
	hasNoLines := len(lg.lines) == 0
	lg.access.commentOnly = true
	return hasNoLines
}

// regexTokens returns one regexToken per regex in the block options, built
// from matching them against the joined lines. The entry for a regex that did
// not match is nil. Each capture group token points at its own access recorder
// inside lg.access; those recorders are allocated on first use and reused by
// later calls.
func (lg *lineGroup) regexTokens() []regexToken {
	// TODO: jfaer - Should we match regexes on the original content?
	matches := lg.opts.matchRegexes(lg.internalJoinedLines())
	if lg.access.regexTokens == nil {
		lg.access.regexTokens = make([]regexTokenAccessRecorder, len(matches))
	}

	noPrefixOrder := func() *prefixOrder { return nil }
	tokens := make([]regexToken, len(matches))
	for i, groups := range matches {
		if groups == nil {
			// Regex did not match.
			continue
		}

		if lg.access.regexTokens[i] == nil {
			lg.access.regexTokens[i] = make(regexTokenAccessRecorder, len(groups))
		}
		tok := make(regexToken, len(groups))
		for j, raw := range groups {
			// Only try to match PrefixOrder on the first capture group in a regex.
			// TODO: jfaer - Should this just be the first capture group in the first regex match?
			order := noPrefixOrder
			if j == 0 {
				order = lg.prefixOrder
			}
			tok[j] = &captureGroupToken{
				opts:        &lg.opts,
				prefixOrder: order,
				raw:         raw,
				access:      &lg.access.regexTokens[i][j],
			}
		}
		tokens[i] = tok
	}
	return tokens
}

// internalJoinedLines calculates the same thing as joinedLines, except it
// doesn't record that it was used in the accessRecorder.
//
// The lines are concatenated after stripping their leading whitespace. A
// single space is inserted between two consecutive lines only when the first
// ends with a word character and the second starts with one, so that separate
// words do not fuse together. It returns "" for a group without lines.
func (lg *lineGroup) internalJoinedLines() string {
	if len(lg.lines) == 0 {
		return ""
	}

	// isWordByte matches exactly what the regexp class \w matches in Go, which
	// is ASCII-only: [0-9A-Za-z_]. Testing the boundary bytes directly avoids
	// compiling regular expressions on every call; this method runs for every
	// comparison made while sorting.
	isWordByte := func(b byte) bool {
		return b == '_' ||
			('0' <= b && b <= '9') ||
			('a' <= b && b <= 'z') ||
			('A' <= b && b <= 'Z')
	}

	var s strings.Builder
	var last string
	for _, l := range lg.lines {
		l := strings.TrimLeftFunc(l, unicode.IsSpace)
		if len(last) > 0 && len(l) > 0 && isWordByte(last[len(last)-1]) && isWordByte(l[0]) {
			s.WriteString(" ")
		}
		s.WriteString(l)
		last = l
	}
	return s.String()
}

// joinedLines returns the group's content lines joined into a single string
// (see internalJoinedLines) and records that the value was consulted.
func (lg *lineGroup) joinedLines() string {
	joined := lg.internalJoinedLines()
	lg.access.joinedLines = true
	return joined
}

// joinedComment returns the group's comment lines joined with newlines, or ""
// when there is no comment, and records that the value was consulted.
func (lg *lineGroup) joinedComment() string {
	lg.access.joinedComment = true
	if len(lg.comment) > 0 {
		return strings.Join(lg.comment, "\n")
	}
	return ""
}

// DebugString renders the group for debugging output: its raw comment and
// content lines, followed by only those derived values that the access
// recorder says were consulted.
func (lg *lineGroup) DebugString() string {
	var out strings.Builder

	// writeQuoted prints a header followed by each entry in Go syntax; it
	// prints nothing at all for an empty list.
	writeQuoted := func(header string, entries []string) {
		if len(entries) == 0 {
			return
		}
		out.WriteString(header)
		for _, e := range entries {
			fmt.Fprintf(&out, "  %#v\n", e)
		}
	}

	out.WriteString("LineGroup{\n")
	writeQuoted("comment=\n", lg.comment)
	writeQuoted("lines=\n", lg.lines)

	if lg.access.commentOnly {
		fmt.Fprintf(&out, "commentOnly=%t\n", lg.commentOnly())
	}
	if lg.access.regexTokens != nil {
		for i, tok := range lg.regexTokens() {
			if !tok.wasUsed() {
				continue
			}
			fmt.Fprintf(&out, "regex[%d]=%s\n", i, tok.DebugString())
		}
	}

	switch {
	case !lg.access.joinedLines:
		// The joined lines were never consulted; nothing to report.
	case len(lg.lines) > 1:
		// Only print the joinedLines when they're meaningfully different from the
		// raw lines above.
		fmt.Fprintf(&out, "joinedLines=%#v\n", lg.joinedLines())
	case !lg.access.joinedComment:
		out.WriteString("linesTiebreaker=true\n")
	}
	if lg.access.joinedComment {
		out.WriteString("commentTiebreaker=true\n")
	}

	out.WriteString("}")
	return out.String()
}

// allLines returns a new slice holding the comment lines followed by the
// content lines. The result is nil when the group has neither.
func (lg *lineGroup) allLines() []string {
	withComment := append([]string(nil), lg.comment...)
	return append(withComment, lg.lines...)
}

// String returns the comment lines followed by the content lines, joined with
// newlines.
func (lg *lineGroup) String() string {
	everything := lg.allLines()
	return strings.Join(everything, "\n")
}

// regexToken holds the capture group tokens produced by matching one regex
// against a lineGroup. A nil regexToken means the regex did not match.
type regexToken []*captureGroupToken

// regexTokenAccessRecorder holds one access recorder per capture group of a
// regexToken.
type regexTokenAccessRecorder []captureGroupTokenAccessRecorder

// wasUsed reports whether the token is worth showing in debug output: always
// for a regex that did not match (nil token), otherwise only if at least one
// of its capture groups was consulted.
func (t regexToken) wasUsed() bool {
	if t == nil {
		// Report that the regex didn't match.
		return true
	}
	for i := range t {
		if t[i].wasUsed() {
			return true
		}
	}
	return false
}

// DebugString describes the token for debugging output. A nil token renders
// as "<did not match>". Otherwise every capture group contributes either its
// own DebugString or "<unused>"; a single capture group is printed bare,
// any other number is printed as a list.
func (t regexToken) DebugString() string {
	if t == nil {
		return "<did not match>"
	}

	parts := make([]string, 0, len(t))
	for _, cg := range t {
		desc := "<unused>"
		if cg.wasUsed() {
			desc = cg.DebugString()
		}
		parts = append(parts, desc)
	}

	if len(parts) != 1 {
		return fmt.Sprint(parts)
	}
	return parts[0]
}

// captureGroupToken is the sort key derived from one capture group of a regex
// match.
type captureGroupToken struct {
	// Options of the block; consulted by transform.
	opts        *blockOptions
	// Returns the prefix order to match raw against, or nil when prefix
	// ordering does not apply to this capture group.
	prefixOrder func() *prefixOrder

	// The text of the capture group.
	raw string

	// Records which accessors were called on this token. regexTokens points it
	// into the owning lineGroup's accessRecorder.
	access *captureGroupTokenAccessRecorder
}

// captureGroupTokenAccessRecorder remembers which captureGroupToken accessors
// were called.
type captureGroupTokenAccessRecorder struct {
	// Set by prefix when a prefix order is available.
	prefix    bool
	// Set by transform.
	transform bool
}

// prefix returns the ordered prefix that the raw text matches. When no prefix
// order applies to this token it returns the zero orderedPrefix and does not
// record the access.
func (t *captureGroupToken) prefix() orderedPrefix {
	if ord := t.prefixOrder(); ord != nil {
		t.access.prefix = true
		return ord.match(t.raw)
	}
	return orderedPrefix{}
}

// transform builds the comparison key for the raw text and records that it
// was consulted. The ignore prefix is trimmed, the text is lower-cased unless
// the block is case sensitive, and the result is passed to maybeParseNumeric.
func (t *captureGroupToken) transform() numericTokens {
	t.access.transform = true
	// All switches (for example, case-insensitive and numeric ordering) have to
	// be folded into one comparison key. Applying them as separate
	// sub-orderings would let one preempt the total ordering:
	//   Foo_45
	//   foo_123
	//   foo_6
	// would be sorted as either (numeric but not case-insensitive)
	//   Foo_45
	//   foo_6
	//   foo_123
	// or (case-insensitive but not numeric)
	//   foo_123
	//   Foo_45
	//   foo_6
	// but should be (case-insensitive and numeric)
	//   foo_6
	//   Foo_45
	//   foo_123
	key := t.opts.trimIgnorePrefix(t.raw)
	if t.opts.CaseSensitive {
		return t.opts.maybeParseNumeric(key)
	}
	return t.opts.maybeParseNumeric(strings.ToLower(key))
}

// wasUsed reports whether prefix or transform was recorded as consulted for
// this token.
func (t captureGroupToken) wasUsed() bool {
	rec := t.access
	return rec.prefix || rec.transform
}

// DebugString describes the parts of the token that were consulted: the
// matched prefix and/or the transformed tokens. The tokens are labelled with
// "tokens:" only when a prefix is printed as well, and the pair is then
// wrapped in square brackets.
func (t captureGroupToken) DebugString() string {
	parts := make([]string, 0, 2)
	if t.access.prefix {
		parts = append(parts, fmt.Sprintf("prefix:%q", t.prefix().prefix))
	}
	if t.access.transform {
		label := ""
		if len(parts) > 0 {
			label = "tokens:"
		}
		parts = append(parts, fmt.Sprintf("%s%s", label, t.transform().DebugString()))
	}

	joined := strings.Join(parts, " ")
	if len(parts) > 1 {
		return "[" + joined + "]"
	}
	return joined
}