Skip to content

Commit 2422255

Browse files
committed
pythonbuild: normalize all tar archives
The impetus for this was wanting to put PYTHON.json earlier in the archive. But we do end up making archives deterministic where they weren't before. So this seems like a good move all around.
1 parent 67691ea commit 2422255

File tree

4 files changed

+104
-5
lines changed

4 files changed

+104
-5
lines changed

cpython-windows/build.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import argparse
77
import concurrent.futures
8+
import io
89
import json
910
import os
1011
import pathlib
@@ -24,6 +25,7 @@
2425
extract_tar_to_directory,
2526
extract_zip_to_directory,
2627
compress_python_archive,
28+
normalize_tar_archive,
2729
release_tag_from_git,
2830
validate_python_json,
2931
)
@@ -2260,8 +2262,19 @@ def build_cpython(
22602262
"cpython-%s-%s-%s.tar" % (entry["version"], target_triple, profile,)
22612263
)
22622264

2265+
data = io.BytesIO()
2266+
create_tar_from_directory(data, td / "out")
2267+
data.seek(0)
2268+
2269+
data = normalize_tar_archive(data)
2270+
22632271
with dest_path.open("wb") as fh:
2264-
create_tar_from_directory(fh, td / "out")
2272+
while True:
2273+
chunk = data.read(32768)
2274+
if not chunk:
2275+
break
2276+
2277+
fh.write(chunk)
22652278

22662279
return dest_path
22672280

pythonbuild/buildenv.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@
1414
from .docker import container_exec, container_get_archive, copy_file_to_container
1515
from .downloads import DOWNLOADS
1616
from .logging import log
17-
from .utils import create_tar_from_directory, exec_and_log, extract_tar_to_directory
17+
from .utils import (
18+
create_tar_from_directory,
19+
exec_and_log,
20+
extract_tar_to_directory,
21+
normalize_tar_archive,
22+
)
1823

1924

2025
class ContainerContext(object):
@@ -108,6 +113,8 @@ def get_output_archive(self, path=None, as_tar=False):
108113
data = container_get_archive(self.container, p)
109114
data = io.BytesIO(data)
110115

116+
data = normalize_tar_archive(data)
117+
111118
if as_tar:
112119
return tarfile.open(fileobj=data)
113120
else:
@@ -212,9 +219,10 @@ def get_output_archive(self, path, as_tar=False):
212219

213220
data = io.BytesIO()
214221
create_tar_from_directory(data, p, path_prefix=p.parts[-1])
215-
216222
data.seek(0)
217223

224+
data = normalize_tar_archive(data)
225+
218226
if as_tar:
219227
return tarfile.open(fileobj=data)
220228
else:

pythonbuild/utils.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44

55
import gzip
66
import hashlib
7+
import io
78
import multiprocessing
89
import os
910
import pathlib
11+
import stat
1012
import subprocess
1113
import sys
1214
import tarfile
@@ -247,6 +249,67 @@ def extract_zip_to_directory(source: pathlib.Path, dest: pathlib.Path):
247249
zf.extractall(dest)
248250

249251

# 2021-01-01T00:00:00 in the original author's local offset; as a UNIX
# timestamp this is 2021-01-01T08:00:00 UTC. Value kept as-is so archive
# output stays byte-identical across rebuilds.
DEFAULT_MTIME = 1609488000


def normalize_tar_archive(data: io.BytesIO) -> io.BytesIO:
    """Normalize the contents of a tar archive.

    Rewrites the tar archive held in ``data`` so the output is as
    deterministic as possible: directory entries are dropped, members are
    sorted by name (with ``python/PYTHON.json`` forced first so metadata can
    be read without scanning the whole archive), and ownership, timestamps,
    and group permission bits are made uniform.

    Returns a new ``io.BytesIO`` positioned at offset 0.
    """
    entries = []

    with tarfile.open(fileobj=data) as tf:
        for member in tf:
            # Directory entries carry no payload and extraction tools
            # recreate directories implicitly, so drop them.
            if member.isdir():
                continue

            fh = tf.extractfile(member)
            # extractfile() returns None for non-regular members (e.g.
            # symlinks); buffer regular file contents in memory.
            payload = io.BytesIO(fh.read()) if fh is not None else None
            entries.append((member, payload))

    # python/PYTHON.json sorts before everything else; remaining members
    # are ordered by name. (False < True, so the json entry wins.)
    entries.sort(key=lambda e: (e[0].name != "python/PYTHON.json", e[0].name))

    # Normalize attributes on archive members.
    for member, _ in entries:
        member.mtime = DEFAULT_MTIME
        member.uid = 0
        member.gid = 0
        member.uname = "root"
        member.gname = "root"

        # Ensure user and group can read and write every entry.
        member.mode |= stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP

        # Mirror the user execute bit onto the group.
        if member.mode & stat.S_IXUSR:
            member.mode |= stat.S_IXGRP

    dest = io.BytesIO()
    with tarfile.open(fileobj=dest, mode="w") as out:
        for member, payload in entries:
            out.addfile(member, payload)

    dest.seek(0)

    return dest
311+
312+
250313
def compress_python_archive(
251314
source_path: pathlib.Path, dist_path: pathlib.Path, basename: str
252315
):

src/main.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,23 @@ fn validate_distribution(dist_path: &Path) -> Result<Vec<String>> {
700700
let dctx = zstd::stream::Decoder::new(reader)?;
701701
let mut tf = tar::Archive::new(dctx);
702702

703-
for entry in tf.entries()? {
703+
// First entry in archive should be python/PYTHON.json.
704+
let mut entries = tf.entries()?;
705+
706+
let mut entry = entries.next().unwrap()?;
707+
if entry.path()?.display().to_string() == "python/PYTHON.json" {
708+
let mut data = Vec::new();
709+
entry.read_to_end(&mut data)?;
710+
let json = parse_python_json(&data).context("parsing PYTHON.json")?;
711+
errors.extend(validate_json(&json, triple)?);
712+
} else {
713+
errors.push(format!(
714+
"1st archive entry should be for python/PYTHON.json; got {}",
715+
entry.path()?.display()
716+
));
717+
}
718+
719+
for entry in entries {
704720
let mut entry = entry.map_err(|e| anyhow!("failed to iterate over archive: {}", e))?;
705721
let path = entry.path()?.to_path_buf();
706722

@@ -741,7 +757,6 @@ fn validate_distribution(dist_path: &Path) -> Result<Vec<String>> {
741757

742758
if path == PathBuf::from("python/PYTHON.json") {
743759
let json = parse_python_json(&data).context("parsing PYTHON.json")?;
744-
745760
errors.extend(validate_json(&json, triple)?);
746761
}
747762
}

0 commit comments

Comments
 (0)