Skip to content

Commit 848aa38

Browse files
authored
Merge pull request #3474 from cyphar/seccomp-enosys-setup
seccomp: enosys: always return -ENOSYS for setup(2)
2 parents 967079d + 6a79271 commit 848aa38

File tree

3 files changed

+66
-21
lines changed

3 files changed

+66
-21
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2222

2323
* In case the runc binary resides on tmpfs, `runc init` no longer re-execs
2424
itself twice. (#3342)
25+
* Our seccomp `-ENOSYS` stub now correctly handles multiplexed syscalls on
26+
s390 and s390x. This solves the issue where syscalls the host kernel did not
27+
support would return `-EPERM` despite the existence of the `-ENOSYS` stub
28+
code (this was due to how s390x does syscall multiplexing). (#3474)
2529

2630
## [1.1.0] - 2022-01-14
2731

libcontainer/seccomp/patchbpf/enosys_linux.go

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ import "C"
8080

8181
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
8282

83+
// Assume sizeof(int) == 4 in the BPF program.
84+
const bpfSizeofInt = 4
85+
86+
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
87+
// syscalls will end up with this syscall number, so we need to explicitly
88+
// return -ENOSYS for this syscall on those architectures.
89+
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
90+
8391
func isAllowAction(action configs.Action) bool {
8492
switch action {
8593
// Trace is considered an "allow" action because a good tracer should
@@ -94,15 +102,14 @@ func isAllowAction(action configs.Action) bool {
94102

95103
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
96104
var program []bpf.RawInstruction
97-
loop:
98105
for {
99106
// Read the next instruction. We have to use NativeEndian because
100107
// seccomp_export_bpf outputs the program in *host* endian-ness.
101108
var insn unix.SockFilter
102109
if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
103110
if errors.Is(err, io.EOF) {
104111
// Parsing complete.
105-
break loop
112+
break
106113
}
107114
if errors.Is(err, io.ErrUnexpectedEOF) {
108115
// Parsing stopped mid-instruction.
@@ -315,19 +322,46 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
315322
// directly from the arch code so we need to do it here. Sadly we can't
316323
// share this code between architecture branches.
317324
section := []bpf.Instruction{
318-
// load [0]
319-
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
325+
// load [0] (syscall number)
326+
bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt},
320327
}
321328

322329
switch len(maxSyscalls) {
323330
case 0:
324331
// No syscalls found for this arch -- skip it and move on.
325332
continue
326333
case 1:
327-
// Get the only syscall in the map.
328-
var sysno libseccomp.ScmpSyscall
329-
for _, no := range maxSyscalls {
334+
// Get the only syscall and scmpArch in the map.
335+
var (
336+
scmpArch libseccomp.ScmpArch
337+
sysno libseccomp.ScmpSyscall
338+
)
339+
for arch, no := range maxSyscalls {
330340
sysno = no
341+
scmpArch = arch
342+
}
343+
344+
switch scmpArch {
345+
// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
346+
// multiplexing "large syscall number" syscalls, but if the syscall
347+
// number is not known to the kernel then the syscall number is
348+
// left unchanged (and because it is sysno=0, you'll end up with
349+
// EPERM for syscalls the kernel doesn't know about).
350+
//
351+
// The actual setup(2) syscall is never used by userspace anymore
352+
// (and hasn't existed for decades) outside of this multiplexing
353+
// scheme so returning -ENOSYS is fine.
354+
case libseccomp.ArchS390, libseccomp.ArchS390X:
355+
section = append(section, []bpf.Instruction{
356+
// jne [setup=0],1
357+
bpf.JumpIf{
358+
Cond: bpf.JumpNotEqual,
359+
Val: uint32(s390xMultiplexSyscall),
360+
SkipTrue: 1,
361+
},
362+
// ret [ENOSYS]
363+
bpf.RetConstant{Val: retErrnoEnosys},
364+
}...)
331365
}
332366

333367
// The simplest case just boils down to a single jgt instruction,
@@ -349,8 +383,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
349383
sectionTail = []bpf.Instruction{
350384
// jle [syscall],1
351385
bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
352-
// ja [baseJumpEnosys+1]
353-
bpf.Jump{Skip: baseJumpEnosys + 1},
386+
// ret [ENOSYS]
387+
bpf.RetConstant{Val: retErrnoEnosys},
354388
// ja [baseJumpFilter]
355389
bpf.Jump{Skip: baseJumpFilter},
356390
}
@@ -359,12 +393,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
359393
// If we're on x86 we need to add a check for x32 and if we're in
360394
// the wrong mode we jump over the section.
361395
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
362-
// Grab the only architecture in the map.
363-
var scmpArch libseccomp.ScmpArch
364-
for arch := range maxSyscalls {
365-
scmpArch = arch
366-
}
367-
368396
// Generate a prefix to check the mode.
369397
switch scmpArch {
370398
case libseccomp.ArchAMD64:
@@ -440,7 +468,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
440468
// jset (1<<30),1
441469
// jgt [x86 syscall],1,2
442470
// jle [x32 syscall],1
443-
// ja [baseJumpEnosys+1]
471+
// ret [ENOSYS]
444472
// ja [baseJumpFilter]
445473
section = append(section, []bpf.Instruction{
446474
// jset (1<<30),1
@@ -451,14 +479,14 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
451479
Val: uint32(x86sysno),
452480
SkipTrue: 1, SkipFalse: 2,
453481
},
454-
// jle [x32 syscall],[baseJumpEnosys]
482+
// jle [x32 syscall],1
455483
bpf.JumpIf{
456484
Cond: bpf.JumpLessOrEqual,
457485
Val: uint32(x32sysno),
458486
SkipTrue: 1,
459487
},
460-
// ja [baseJumpEnosys+1]
461-
bpf.Jump{Skip: baseJumpEnosys + 1},
488+
// ret [ENOSYS]
489+
bpf.RetConstant{Val: retErrnoEnosys},
462490
// ja [baseJumpFilter]
463491
bpf.Jump{Skip: baseJumpFilter},
464492
}...)
@@ -522,8 +550,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
522550

523551
// Prepend the load instruction for the architecture.
524552
programTail = append([]bpf.Instruction{
525-
// load [4]
526-
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
553+
// load [4] (architecture)
554+
bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt},
527555
}, programTail...)
528556

529557
// And that's all folks!

libcontainer/seccomp/patchbpf/enosys_linux_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
213213
})
214214
}
215215

216+
// If we're on s390(x) make sure you get -ENOSYS for the "setup"
217+
// syscall (this is done to work around an issue with s390x's
218+
// syscall multiplexing which results in unknown syscalls being a
219+
// setup(2) invocation).
220+
switch scmpArch {
221+
case libseccomp.ArchS390, libseccomp.ArchS390X:
222+
syscallTests = append(syscallTests, syscallTest{
223+
sysno: s390xMultiplexSyscall,
224+
syscall: "setup",
225+
expected: retErrnoEnosys,
226+
})
227+
}
228+
216229
// Test syscalls in the explicit list.
217230
for _, test := range syscallTests {
218231
// Override the expected value in the two special cases.

0 commit comments

Comments
 (0)