project--shell-script-performance-and-portability/bin/t-file-grep-vs-match-in-memory.sh at master · jaalto/project--shell-script-performance-and-portability · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#! /usr/bin/env bash
#
# Q: To search file for matches: in memory search vs `grep`
# A: It is about 8-10x faster to read file into memory and then do matching
# priority: 10
#
#     t1a real 0m0.049s read once + bash regexp (read file once + use loop)
#     t1b real 0m0.054s read once + case..MATCH..esac (read file once + use loop)
#     t2  real 0m0.283s grep
#     t3  real 0m0.407s read + case..MATCH..esac (separate file calls)
#     t4  real 0m0.440s read + bash regexp (separate file calls)
#
# Code:
#
# See the test code for more information. Overview:
#
#     t1a read once and loop [[ str =~~ RE ]]
#     t1b read once and loop case..MATCH..esac
#     t2  grep RE file in loop
#     t3  read every time in loop. case..MATCH..esac
#     t4  read every time in loop. [[ str =~~ RE ]]
#
# Notes:
#
# Repeated reads of the same file
# probably benefit from the kernel's
# cache to some extent. But it is still
# much faster to read the file once into
# memory and then apply matching
# multiple times.
#
# The `grep` command is leaps ahead of
# re-reading the file in a loop and using
# the shell’s own matching capabilities.
#
# In Ksh, the "read into memory first, then
# match" is extremely fast:
#
#     t1a real 0.005  user 0.002  sys 0.002
#     t1b real 0.334  user 0.328  sys 0.005
#     t2  real 0.221  user 0.126  sys 0.096
#     t3  real 0.478  user 0.343  sys 0.137
#     t4  real 0.222  user 0.153  sys 0.071

# Load the shared test library from the current directory.
# NOTE(review): the library presumably provides $random_file,
# $TMPBASE, $loop_max and the helper functions used further
# down; none of them are defined in this file.
. ./t-lib.sh
rand=$random_file

# Scratch file that Setup() fills with the test data.
# The template is quoted so that a $TMPBASE containing
# whitespace or glob characters stays one single argument.
f=$(mktemp -t "$TMPBASE.random.file.XXX")

string=abc                  # literal to search for
pattern="$string*$string"   # shell glob for case..esac
re="$string.*$string"       # extended regexp for =~ and grep

# Setup: create the test data in file $f.
#
# The first line is "$string $string", so every matcher has
# something to find; the contents of $rand follow it.
#
# Globals: string, rand (read); the file named by $f (overwritten)
# Returns: status of the redirected command group
Setup ()
{
    # Paths are quoted: a temporary directory with whitespace in
    # its name must not be word-split, and an empty $rand must not
    # turn into a bare "cat" that waits on stdin.
    {
        printf '%s %s\n' "$string" "$string"
        cat "$rand"
    } > "$f"
}

# Info: print one "INFO:" line with the long listing of the
# test file $f to stdout.
#
# Globals: f (read)
Info ()
{
    # $f is quoted so that a path with whitespace is listed as
    # one file instead of being split into several names.
    printf 'INFO: test file: %s\n' "$(ls -l "$f")"
}

# Read: load the whole file named by $1 into the variable REPLY.
#
# Arguments: $1 - path of the file to read
# Globals:   REPLY (written)
# Returns:   exit status of cat
Read ()
{
    # A faster builtin form exists, but not every shell has it:
    #
    #   read -N$((100 * 1024)) REPLY < "$1"
    #
    # Command substitution with cat is the POSIX way.
    REPLY="$(cat "$1")"
}

# MatchFileContentPattern: read file $1 and test whether its
# content contains a match for the glob $pattern. POSIX only.
#
# Arguments: $1 - path of the file to examine
# Globals:   pattern (read), REPLY (written by Read)
# Returns:   0 if the content matches, 1 otherwise
MatchFileContentPattern ()  # POSIX
{
    Read "$1"

    # A case pattern must match the whole string, so wrap the
    # glob in "*" to search anywhere in the content. This is the
    # same form t1b uses and mirrors the unanchored regexp $re.
    case ${REPLY:-} in
        *$pattern*)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

# The regexp-based versions are kept out of sight of other
# shells (they live in the generated t.bash). These no-op
# placeholders stand in for them until that file is sourced.
MatchFileContentRegexp ()  # stub
{
    :
}

t1a ()  # stub
{
    :
}

# Functions that use [[ ... =~ ... ]] are written to a separate
# file so that shells unable to parse that syntax never see
# them. The file is sourced only if IsFeatureMatchRegexp succeeds.
cat << 'EOF' > t.bash
MatchFileContentRegexp () # Bash regexp
{
    Read "$1"

    [[ "$REPLY" =~ $re ]]
}

t1a () # read once
{
    Read "$f"

    # Match on the plain $string. Do not assign to the global
    # $re here: t2 and t4 still need its original value.
    for i in $(seq $loop_max)
    do
        [[ $REPLY =~ $string ]]
    done
}
EOF

# Pull in the generated regexp functions only when
# IsFeatureMatchRegexp reports success (presumably: the running
# shell supports =~). The helper file is removed in either case.
if IsFeatureMatchRegexp; then
    . ./t.bash
fi

rm --force t.bash

# t1b: read file $f into memory once, then run the case..esac
# glob match $loop_max times against the in-memory copy.
#
# Globals: f, loop_max, pattern (read); REPLY, i (written)
t1b () # read once
{
    Read "$f"

    for i in $(seq $loop_max); do
        case ${REPLY:-} in
            *$pattern*)
                ;;
        esac
    done
}

# t2: run grep with the extended regexp $re against file $f,
# $loop_max times. Nothing is printed (--quiet); the return
# status is that of the last grep call.
#
# Globals: f, loop_max, re (read); i (written)
t2 () # read every time
{
    for i in $(seq $loop_max)
    do
        # "grep -E" is the one that is typically used.
        # $f is quoted so that a path with whitespace stays
        # one file argument.
        grep --quiet --extended-regexp --files-with-matches "$re" "$f"
    done
}

# t3: call MatchFileContentPattern on file $f, $loop_max times,
# so the file is re-read on every iteration.
#
# Globals: f, loop_max (read); i (written)
# Returns: status of the last MatchFileContentPattern call
t3 () # read every time
{
    for i in $(seq $loop_max)
    do
        # Quoted: the path must arrive as exactly one argument.
        MatchFileContentPattern "$f"
    done
}

# t4: call MatchFileContentRegexp on file $f, $loop_max times,
# so the file is re-read on every iteration.
#
# Globals: f, loop_max (read); i (written)
# Returns: status of the last MatchFileContentRegexp call
t4 () # read every time
{
    for i in $(seq $loop_max)
    do
        # Quoted: the path must arrive as exactly one argument.
        MatchFileContentRegexp "$f"
    done
}

# List of test cases, passed to RunTests at the end of the file.
# Each line is ":t <function>", optionally followed by the name
# of a feature check.
# NOTE(review): presumably a test is skipped when its feature
# check fails -- confirm against RunTests in t-lib.sh.
t="\
:t t1a IsFeatureMatchRegexp
:t t1b
:t t2
:t t3
:t t4 IsFeatureMatchRegexp
"

# Prepare the run: SetupTrapAtExit comes from the sourced library
# (presumably it installs exit-time cleanup -- confirm in
# t-lib.sh); Setup creates the test file.
SetupTrapAtExit
Setup

# Dispatch on how the script was started:
#   $source set  - do nothing more
#   $run set     - execute the command line given as arguments
#   otherwise    - hand the test list and arguments to RunTests
if [ -n "$source" ]; then
    :
elif [ -n "$run" ]; then
    "$@"
else
    RunTests "$t" "$@"
fi

# End of file