Skip to content

Commit 090889a

Browse files
authored
Merge pull request #115 from infosiftr/sources-cache
Add caching for `sources.json` creation
2 parents 1ac45c0 + aa0fbf5 commit 090889a

File tree

4 files changed

+191
-52
lines changed

4 files changed

+191
-52
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ jobs:
2424
run: |
2525
# not doing "uses: docker-library/bashbrew@xxx" because it'll build which is slow and we don't need more than just bashbrew here
2626
mkdir .bin
27-
wget --timeout=5 -O .bin/bashbrew 'https://github.com/docker-library/bashbrew/releases/download/v0.1.11/bashbrew-amd64'
28-
echo '6203635644d0efef2886f8ea9c487995a7abc4166db7a4773e94f89c943a4b04 *.bin/bashbrew' | sha256sum --strict --check -
27+
wget --timeout=5 -O .bin/bashbrew 'https://github.com/docker-library/bashbrew/releases/download/v0.1.13/bashbrew-amd64'
28+
echo 'a13dca73181bc68dc9fb695ca1b4003a12077551ccc02eb0c232a0313e88d7c1 *.bin/bashbrew' | sha256sum --strict --check -
2929
chmod +x .bin/bashbrew
3030
.bin/bashbrew --version
3131
echo "$PWD/.bin" >> "$GITHUB_PATH"

.test/test.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,21 @@ set -- docker:cli docker:dind docker:windowsservercore notary busybox:{latest,gl
3131

3232
time bashbrew fetch "$@"
3333

34-
time "$dir/../sources.sh" "$@" > "$dir/sources-doi.json"
34+
# generate sources, but remove the first item so we can test the cache with some entries missing
35+
time "$dir/../sources.sh" "$@" | jq 'del(first(.[]))' > "$dir/sources-cache.json"
36+
# again but with cache
37+
time "$dir/../sources.sh" --cache-file "$dir/sources-cache.json" "$@" > "$dir/sources-doi.json"
3538

3639
# also fetch/include Tianon's more cursed "infosiftr/moby" example (a valid manifest with arch-specific non-archTags that end up mapping to the same sourceId)
3740
bashbrew fetch infosiftr-moby
38-
( BASHBREW_ARCH_NAMESPACES= "$dir/../sources.sh" infosiftr-moby > "$dir/sources-moby.json" )
41+
( BASHBREW_ARCH_NAMESPACES= "$dir/../sources.sh" infosiftr-moby > "$dir/sources-cache.json" )
42+
# again but with cache
43+
( BASHBREW_ARCH_NAMESPACES= "$dir/../sources.sh" --cache-file="$dir/sources-cache.json" infosiftr-moby > "$dir/sources-moby.json" )
3944
# technically, this *also* needs BASHBREW_STAGING_TEMPLATE='tianon/zz-staging:ARCH-BUILD', but that's a "builds.sh" flag and separating that would complicate including this even more, so Tianon has run the following one-liner to "inject" those builds as if they lived in 'oisupport/staging-ARCH:BUILD' instead:
4045
# jq -r '[ .[] | select(any(.source.arches[].tags[]; startswith("infosiftr-moby:"))) | "tianon/zz-staging:\(.build.arch)-\(.buildId)" as $tianon | @sh "../bin/lookup \($tianon) | jq --arg img \(.build.img) \("{ indexes: { ($img): . } }")" ] | "{ " + join(" && ") + @sh " && cat cache-builds.json; } | jq -s --tab \("reduce .[] as $i ({ indexes: { } }; .indexes += $i.indexes)") > cache-builds.json.new && mv cache-builds.json.new cache-builds.json"' builds.json | bash -Eeuo pipefail -x
4146
# (and then re-run the tests to canonicalize the file ordering)
4247
jq -s 'add' "$dir/sources-doi.json" "$dir/sources-moby.json" > "$dir/sources.json"
43-
rm -f "$dir/sources-doi.json" "$dir/sources-moby.json"
48+
rm -f "$dir/sources-doi.json" "$dir/sources-moby.json" "$dir/sources-cache.json"
4449

4550
# an attempt to highlight tag mapping bugs in the future
4651
jq '

Jenkinsfile.meta

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,13 @@ node {
7373
fi
7474

7575
if [ -n "$needsBuild" ]; then
76-
.scripts/sources.sh --all > sources.json
76+
# use previous run as cache
77+
[ -s sources.json ] && cp sources.json sources-copy.json
78+
79+
.scripts/sources.sh --cache-file sources-copy.json --all > sources.json
80+
81+
# clean up temporary cache
82+
rm -f sources-copy.json
7783
fi
7884
'''
7985
}

sources.sh

Lines changed: 174 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,21 @@
11
#!/usr/bin/env bash
22
set -Eeuo pipefail
33

4+
cacheFile=
5+
if [ "$#" -gt 0 ]; then
6+
case "$1" in
7+
--cache-file=*)
8+
cacheFile="${1#*=}"
9+
shift
10+
;;
11+
--cache-file)
12+
shift
13+
cacheFile="$1"
14+
shift
15+
;;
16+
esac
17+
fi
18+
419
if [ "$#" -eq 0 ]; then
520
set -- --all
621
fi
@@ -39,60 +54,172 @@ for tag in $externalPins; do
3954
externalPinsJson="$(jq <<<"$externalPinsJson" -c --arg tag "${tag#library/}" --arg digest "$digest" '.[$tag] = $digest')"
4055
done
4156

42-
bashbrew cat --build-order --format '
43-
{{- range $e := .SortedEntries false -}}
44-
{{- range $a := $e.Architectures -}}
45-
{{- $archNs := archNamespace $a -}}
46-
{{- with $e -}}
47-
{{- $sum := $.ArchGitChecksum $a . -}}
48-
{{- $file := .ArchFile $a -}}
49-
{{- $builder := .ArchBuilder $a -}}
50-
{
51-
"sourceId": {{ join "\n" $sum $file $builder "" | sha256sum | json }},
52-
"reproducibleGitChecksum": {{ $sum | json }},
53-
"entries": [ {
54-
"GitRepo": {{ .ArchGitRepo $a | json }},
55-
"GitFetch": {{ .ArchGitFetch $a | json }},
56-
"GitCommit": {{ .ArchGitCommit $a | json }},
57-
"Directory": {{ .ArchDirectory $a | json }},
58-
"File": {{ $file | json }},
59-
"Builder": {{ $builder | json }},
60-
"SOURCE_DATE_EPOCH": {{ ($.ArchGitTime $a .).Unix | json }}
61-
} ],
62-
"arches": {
63-
{{ $a | json }}: {
64-
"tags": {{ $.Tags namespace false . | json }},
65-
"archTags": {{ if $archNs -}} {{ $.Tags $archNs false . | json }} {{- else -}} [] {{- end }},
66-
"froms": {{ $.ArchDockerFroms $a . | json }},
67-
"lastStageFrom": {{ if eq $builder "oci-import" -}}
68-
{{- /* TODO remove this special case: https://github.com/docker-library/bashbrew/pull/92 */ -}}
69-
"scratch"
70-
{{- else -}}
71-
{{ $.ArchLastStageFrom $a . | json }}
72-
{{- end }},
73-
"platformString": {{ (ociPlatform $a).String | json }},
74-
"platform": {{ ociPlatform $a | json }},
75-
"parents": { }
57+
bashbrew_cat() {
58+
local HEAVY_CALC=''
59+
if [ "$1" = '--do-heavy' ]; then
60+
shift
61+
HEAVY_CALC=1
62+
fi
63+
64+
bbCat=( bashbrew cat --build-order --format '
65+
{{- range $e := .SortedEntries false -}}
66+
{{- range $a := $e.Architectures -}}
67+
{{- $archNs := archNamespace $a -}}
68+
{{- with $e -}}
69+
{{- $file := .ArchFile $a -}}
70+
{{- $builder := .ArchBuilder $a -}}
71+
{
72+
{{- if getenv "HEAVY_CALC" -}}
73+
{{- $sum := $.ArchGitChecksum $a . }}
74+
"sourceId": {{ join "\n" $sum $file $builder "" | sha256sum | json }},
75+
"reproducibleGitChecksum": {{ $sum | json }},
76+
{{- else }}
77+
"sourceId": null,
78+
"reproducibleGitChecksum": null,
79+
{{- end }}
80+
"entries": [ {
81+
"GitRepo": {{ .ArchGitRepo $a | json }},
82+
"GitFetch": {{ .ArchGitFetch $a | json }},
83+
"GitCommit": {{ .ArchGitCommit $a | json }},
84+
"Directory": {{ .ArchDirectory $a | json }},
85+
"File": {{ $file | json }},
86+
"Builder": {{ $builder | json }},
87+
"SOURCE_DATE_EPOCH": {{ if getenv "HEAVY_CALC" -}} {{ ($.ArchGitTime $a .).Unix | json }} {{- else -}} null {{- end }}
88+
} ],
89+
"arches": {
90+
{{ $a | json }}: {
91+
"tags": {{ $.Tags namespace false . | json }},
92+
"archTags": {{ if $archNs -}} {{ $.Tags $archNs false . | json }} {{- else -}} [] {{- end }},
93+
"froms": {{ if getenv "HEAVY_CALC" -}} {{ $.ArchDockerFroms $a . | json }} {{- else -}} [] {{- end }},
94+
"lastStageFrom": {{ if getenv "HEAVY_CALC" -}} {{ $.ArchLastStageFrom $a . | json }} {{- else -}} null {{- end }},
95+
"platformString": {{ (ociPlatform $a).String | json }},
96+
"platform": {{ ociPlatform $a | json }},
97+
"parents": { }
98+
}
7699
}
77100
}
78-
}
101+
{{- end -}}
79102
{{- end -}}
80103
{{- end -}}
81-
{{- end -}}
82-
' "$@" | jq 3>&1 1>&2 2>&3- -r '
83-
# https://github.com/jqlang/jq/issues/2063 - "stderr" cannot functionally output a string correctly until jq 1.7+ (which is very very recent), so we hack around it to get some progress output by using Bash to swap stdout and stderr so we can output our objects to stderr and our progress text to stdout and "fix it in post"
84-
# TODO balk / error at multiple arches entries
85-
first(.arches | keys_unsorted[]) as $arch
86-
| .arches[$arch].tags[0] as $tag
87-
| stderr
88-
| "\($tag) (\($arch)): \(.sourceId)"
89-
# TODO if we could get jq 1.7+ for sure, we can drop this entire "jq" invocation and instead have the reduce loop of the following invocation print status strings directly to "stderr"
90-
' | jq -n --argjson pins "$externalPinsJson" '
104+
' "$@" )
105+
if [ -n "$HEAVY_CALC" ]; then
106+
HEAVY_CALC="$HEAVY_CALC" "${bbCat[@]}" | jq 3>&1 1>&2 2>&3- -r '
107+
# https://github.com/jqlang/jq/issues/2063 - "stderr" cannot functionally output a string correctly until jq 1.7+ (which is very very recent), so we hack around it to get some progress output by using Bash to swap stdout and stderr so we can output our objects to stderr and our progress text to stdout and "fix it in post"
108+
# TODO balk / error at multiple arches entries
109+
first(.arches | keys_unsorted[]) as $arch
110+
| .arches[$arch].tags[0] as $tag
111+
| stderr
112+
| "\($tag) (\($arch)): \(.sourceId)"
113+
# TODO if we could get jq 1.7+ for sure, we can drop this entire "jq" invocation and instead have the reduce loop of the following invocation print status strings directly to "stderr"
114+
' | jq -n '[ inputs ]'
115+
else
116+
"${bbCat[@]}" | jq -n '[ inputs ]'
117+
fi
118+
}
119+
120+
# merges heavy-to-calculate data from the second json input (list or map of sources) into the first json input (list of sources)
121+
# uses "mostlyUniqueBitsSum" as a rough analogue for sourceId to correlate data between the input lists
122+
# (the heavy-to-calculate data is: sourceId, reproducibleGitChecksum, SOURCE_DATE_EPOCH, froms, lastStageFrom)
123+
# echo '[{}, {},...] [{extraData},...]' | mergeData
124+
mergeData() {
125+
jq --slurp '
126+
def mostlyUniqueBitsSum($arch):
127+
{
128+
GitCommit,
129+
Directory,
130+
File,
131+
Builder,
132+
133+
# "sourceId" normally does not include arch, but we have to because of the complexity below in needing to match/extract "froms" and "lastStageFrom" correctly since one or both sides of the `mergeData` input is always the uncombined version and we will otherwise lose/clobber data if our fake sourceId is not as granular as our input data
134+
$arch,
135+
} | @json
136+
;
137+
(
138+
[
139+
.[1][] as $source
140+
| $source.arches
141+
| keys[] as $arch
142+
| $source.entries[]
143+
| {
144+
key: mostlyUniqueBitsSum($arch),
145+
value: {
146+
entry: .,
147+
source: $source,
148+
}
149+
}
150+
] | from_entries
151+
) as $cacheFile
152+
| .[0]
153+
| map(
154+
. as $it
155+
| (
156+
$it.arches | keys_unsorted
157+
# ensure input data is just one architecture per source
158+
| if length != 1 then
159+
error("too many architectures in input list: \($it)")
160+
else . end
161+
)[0] as $arch
162+
| (
163+
# match an item by the unique bits that we have
164+
$cacheFile[
165+
# because it is one architecture per source, it will only have one entry (verified below)
166+
$it.entries[0]
167+
| mostlyUniqueBitsSum($arch)
168+
]
169+
| select(.source.sourceId)
170+
| .entry as $entry
171+
| .source
172+
| $it * {
173+
# this might pull in "null" values from the cache if we change the format, but they will get fixed on the second round of "mergeData"
174+
sourceId,
175+
reproducibleGitChecksum,
176+
arches: {
177+
($arch): {
178+
froms: .arches[$arch].froms,
179+
lastStageFrom: .arches[$arch].lastStageFrom,
180+
},
181+
},
182+
}
183+
# because it is one architecture per source, it should also only have one entry
184+
| if .entries | length != 1 then
185+
error("more than one entry in an input source: \(.)")
186+
else . end
187+
| .entries[0].SOURCE_DATE_EPOCH = $entry.SOURCE_DATE_EPOCH
188+
) // $it
189+
)
190+
'
191+
}
192+
193+
sources=
194+
if [ -s "$cacheFile" ]; then
195+
sources="$({ bashbrew_cat "$@"; cat "$cacheFile"; } | mergeData)"
196+
heavy="$(
197+
jq <<<"$sources" -r '
198+
map(
199+
select(any( ..; type == "null" or (type == "array" and length == 0) ))
200+
| first(.arches[].tags[])
201+
| @sh
202+
) | unique
203+
| join(" ")
204+
'
205+
)"
206+
eval "heavy=( $heavy )"
207+
208+
# items missing sourceId/reproducibleGitChecksum (i.e. missing from cache) need to use bashbrew cat to sum files from build context
209+
if [ "${#heavy[@]}" -gt 0 ]; then
210+
# TODO fetch heavy lookup data only for specific architectures
211+
sources="$({ cat <<<"$sources"; bashbrew_cat --do-heavy "${heavy[@]}"; } | mergeData)"
212+
fi
213+
else
214+
sources="$(bashbrew_cat --do-heavy "$@")"
215+
fi
216+
217+
jq <<<"$sources" --argjson pins "$externalPinsJson" '
91218
def unique_unsorted:
92219
# https://unix.stackexchange.com/a/738744/153467
93220
reduce .[] as $a ([]; if IN(.[]; $a) then . else . += [$a] end)
94221
;
95-
reduce inputs as $in ({};
222+
reduce .[] as $in ({};
96223
.[$in.sourceId] |=
97224
if . == null then
98225
$in
@@ -127,6 +254,7 @@ bashbrew cat --build-order --format '
127254
# TODO a lot of this could be removed/parsed during the above reduce, since it has to parse things in build order anyhow
128255
# TODO actually, instead, this bit should be a totally separate script so the use case of "combine sources.json files together" works better 👀
129256
| (
257+
# TODO make this faster, this reduce takes the longest time now
130258
reduce to_entries[] as $e ({};
131259
$e.key as $sourceId
132260
| .[ $e.value.arches[] | .tags[], .archTags[] ] |= (

0 commit comments

Comments
 (0)