Skip to content

Commit a1acca9

Browse files
committed
merge #4219 into opencontainers/runc:main
Aleksa Sarai (2):
  seccomp: patchbpf: always include native architecture in stub
  seccomp: patchbpf: rename nativeArch -> linuxAuditArch

LGTMs: AkihiroSuda kolyshkin
2 parents 8e69225 + ccc500c commit a1acca9

File tree

2 files changed

+105
-56
lines changed

2 files changed

+105
-56
lines changed

libcontainer/seccomp/patchbpf/enosys_linux.go

Lines changed: 59 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -171,87 +171,101 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
171171
return program, nil
172172
}
173173

174-
type nativeArch uint32
174+
type linuxAuditArch uint32
175175

176-
const invalidArch nativeArch = 0
176+
const invalidArch linuxAuditArch = 0
177177

178-
func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
178+
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
179179
switch arch {
180180
case libseccomp.ArchNative:
181181
// Convert to actual native architecture.
182182
arch, err := libseccomp.GetNativeArch()
183183
if err != nil {
184184
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
185185
}
186-
return archToNative(arch)
186+
return scmpArchToAuditArch(arch)
187187
case libseccomp.ArchX86:
188-
return nativeArch(C.C_AUDIT_ARCH_I386), nil
188+
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
189189
case libseccomp.ArchAMD64, libseccomp.ArchX32:
190190
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
191191
// 30th bit of the syscall number set to indicate that it's not a
192192
// normal x86_64 syscall.
193-
return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
193+
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
194194
case libseccomp.ArchARM:
195-
return nativeArch(C.C_AUDIT_ARCH_ARM), nil
195+
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
196196
case libseccomp.ArchARM64:
197-
return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
197+
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
198198
case libseccomp.ArchMIPS:
199-
return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
199+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
200200
case libseccomp.ArchMIPS64:
201-
return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
201+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
202202
case libseccomp.ArchMIPS64N32:
203-
return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
203+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
204204
case libseccomp.ArchMIPSEL:
205-
return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
205+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
206206
case libseccomp.ArchMIPSEL64:
207-
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
207+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
208208
case libseccomp.ArchMIPSEL64N32:
209-
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
209+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
210210
case libseccomp.ArchPPC:
211-
return nativeArch(C.C_AUDIT_ARCH_PPC), nil
211+
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
212212
case libseccomp.ArchPPC64:
213-
return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
213+
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
214214
case libseccomp.ArchPPC64LE:
215-
return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
215+
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
216216
case libseccomp.ArchS390:
217-
return nativeArch(C.C_AUDIT_ARCH_S390), nil
217+
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
218218
case libseccomp.ArchS390X:
219-
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
219+
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
220220
case libseccomp.ArchRISCV64:
221-
return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
221+
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
222222
default:
223223
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
224224
}
225225
}
226226

227-
type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
227+
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
228228

229229
// Figure out largest syscall number referenced in the filter for each
230230
// architecture. We will be generating code based on the native architecture
231231
// representation, but SCMP_ARCH_X32 means we have to track cases where the
232232
// same architecture has different largest syscalls based on the mode.
233233
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
234-
lastSyscalls := make(lastSyscallMap)
235-
// Only loop over architectures which are present in the filter. Any other
236-
// architectures will get the libseccomp bad architecture action anyway.
234+
scmpArchs := make(map[libseccomp.ScmpArch]struct{})
237235
for _, ociArch := range config.Architectures {
238236
arch, err := libseccomp.GetArchFromString(ociArch)
239237
if err != nil {
240238
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
241239
}
240+
scmpArchs[arch] = struct{}{}
241+
}
242+
// On architectures like ppc64le, Docker inexplicably doesn't include the
243+
// native architecture in the architecture list which results in no
244+
// architectures being present in the list at all (rendering the ENOSYS
245+
// stub a no-op). So, always include the native architecture.
246+
if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
247+
return nil, fmt.Errorf("unable to get native arch: %w", err)
248+
} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
249+
logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
250+
scmpArchs[nativeScmpArch] = struct{}{}
251+
}
252+
logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)
242253

243-
// Figure out native architecture representation of the architecture.
244-
nativeArch, err := archToNative(arch)
254+
// Only loop over architectures which are present in the filter. Any other
255+
// architectures will get the libseccomp bad architecture action anyway.
256+
lastSyscalls := make(lastSyscallMap)
257+
for arch := range scmpArchs {
258+
auditArch, err := scmpArchToAuditArch(arch)
245259
if err != nil {
246260
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
247261
}
248262

249-
if _, ok := lastSyscalls[nativeArch]; !ok {
250-
lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
263+
if _, ok := lastSyscalls[auditArch]; !ok {
264+
lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
251265
}
252-
if _, ok := lastSyscalls[nativeArch][arch]; ok {
266+
if _, ok := lastSyscalls[auditArch][arch]; ok {
253267
// Because of ArchNative we may hit the same entry multiple times.
254-
// Just skip it if we've seen this (nativeArch, ScmpArch)
268+
// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
255269
// combination before.
256270
continue
257271
}
@@ -269,10 +283,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
269283
}
270284
}
271285
if largestSyscall != 0 {
272-
lastSyscalls[nativeArch][arch] = largestSyscall
286+
logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
287+
lastSyscalls[auditArch][arch] = largestSyscall
273288
} else {
274-
logrus.Warnf("could not find any syscalls for arch %s", ociArch)
275-
delete(lastSyscalls[nativeArch], arch)
289+
logrus.Warnf("could not find any syscalls for arch %v", arch)
290+
delete(lastSyscalls[auditArch], arch)
276291
}
277292
}
278293
return lastSyscalls, nil
@@ -290,10 +305,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
290305
// close_range(2) which were added out-of-order in the syscall table between
291306
// kernel releases.
292307
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
293-
// A jump-table for each nativeArch used to generate the initial
308+
// A jump-table for each linuxAuditArch used to generate the initial
294309
// conditional jumps -- measured from the *END* of the program so they
295310
// remain valid after prepending to the tail.
296-
archJumpTable := map[nativeArch]uint32{}
311+
archJumpTable := map[linuxAuditArch]uint32{}
297312

298313
// Generate our own -ENOSYS rules for each architecture. They have to be
299314
// generated in reverse (prepended to the tail of the program) because the
@@ -306,7 +321,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
306321
}
307322

308323
// Generate the syscall -ENOSYS rules.
309-
for nativeArch, maxSyscalls := range lastSyscalls {
324+
for auditArch, maxSyscalls := range lastSyscalls {
310325
// The number of instructions from the tail of this section which need
311326
// to be jumped in order to reach the -ENOSYS return. If the section
312327
// does not jump, it will fall through to the actual filter.
@@ -387,7 +402,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
387402

388403
// If we're on x86 we need to add a check for x32 and if we're in
389404
// the wrong mode we jump over the section.
390-
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
405+
if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
391406
// Generate a prefix to check the mode.
392407
switch scmpArch {
393408
case libseccomp.ArchAMD64:
@@ -416,8 +431,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
416431
section = append(section, sectionTail...)
417432
case 2:
418433
// x32 and x86_64 are a unique case, we can't handle any others.
419-
if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
420-
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
434+
if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
435+
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
421436
}
422437

423438
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
@@ -494,7 +509,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
494509
programTail = append(section, programTail...)
495510

496511
// Update jump table.
497-
archJumpTable[nativeArch] = uint32(len(programTail))
512+
archJumpTable[auditArch] = uint32(len(programTail))
498513
}
499514

500515
// Add a dummy "jump to filter" for any architecture we might miss below.
@@ -514,9 +529,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
514529
// architectures based on how large the jumps are going to be, or
515530
// re-sort the candidate architectures each time to make sure that we
516531
// pick the largest jump which is going to be smaller than 255.
517-
for nativeArch := range lastSyscalls {
532+
for auditArch := range lastSyscalls {
518533
// We jump forwards but the jump table is calculated from the *END*.
519-
jump := uint32(len(programTail)) - archJumpTable[nativeArch]
534+
jump := uint32(len(programTail)) - archJumpTable[auditArch]
520535

521536
// Same routine as above -- this is a basic jeq check, complicated
522537
// slightly if it turns out that we need to do a long jump.
@@ -525,7 +540,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
525540
// jeq [arch],[jump]
526541
bpf.JumpIf{
527542
Cond: bpf.JumpEqual,
528-
Val: uint32(nativeArch),
543+
Val: uint32(auditArch),
529544
SkipTrue: uint8(jump),
530545
},
531546
}, programTail...)
@@ -534,7 +549,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
534549
// jne [arch],1
535550
bpf.JumpIf{
536551
Cond: bpf.JumpNotEqual,
537-
Val: uint32(nativeArch),
552+
Val: uint32(auditArch),
538553
SkipTrue: 1,
539554
},
540555
// ja [jump]

libcontainer/seccomp/patchbpf/enosys_linux_test.go

Lines changed: 46 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ type seccompData struct {
2323
}
2424

2525
// mockSyscallPayload creates a fake seccomp_data struct with the given data.
26-
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte {
26+
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte {
2727
var buf bytes.Buffer
2828

2929
data := seccompData{
@@ -105,8 +105,16 @@ var testArches = []string{
105105
"ppc64le",
106106
"s390",
107107
"s390x",
108+
// Dummy value to indicate a configuration with no architecture specified.
109+
"native",
108110
}
109111

112+
// Used for the "native" architecture.
113+
var (
114+
scmpNativeArch, _ = libseccomp.GetNativeArch()
115+
nativeArch = scmpNativeArch.String()
116+
)
117+
110118
func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
111119
explicitSyscalls := []string{
112120
"setns",
@@ -150,17 +158,20 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
150158

151159
for _, arch := range testArches {
152160
type syscallTest struct {
153-
syscall string
154161
sysno libseccomp.ScmpSyscall
162+
syscall string
155163
expected uint32
156164
}
157165

166+
if arch == "native" {
167+
arch = nativeArch
168+
}
158169
scmpArch, err := libseccomp.GetArchFromString(arch)
159170
if err != nil {
160171
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
161172
}
162173

163-
nativeArch, err := archToNative(scmpArch)
174+
auditArch, err := scmpArchToAuditArch(scmpArch)
164175
if err != nil {
165176
t.Fatalf("unknown audit architecture %q: %v", arch, err)
166177
}
@@ -179,9 +190,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
179190
t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err)
180191
}
181192
syscallTests = append(syscallTests, syscallTest{
182-
syscall,
183-
sysno,
184-
expected,
193+
sysno: sysno,
194+
syscall: syscall,
195+
expected: expected,
185196
})
186197
}
187198

@@ -228,12 +239,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
228239

229240
// Test syscalls in the explicit list.
230241
for _, test := range syscallTests {
231-
// Override the expected value in the two special cases.
232-
if !archSet[arch] || isAllowAction(defaultAction) {
242+
// Override the expected value in the two special cases:
243+
// 1. If the default action is allow, the filter won't have
244+
// the stub prepended so we expect a fallthrough.
245+
// 2. If the executing architecture is not in the architecture
246+
// set, then the architecture is not handled by the stub --
247+
// *except* in the case of the native architecture (which
248+
// is always included in the stub).
249+
if isAllowAction(defaultAction) ||
250+
(!archSet[arch] && arch != nativeArch) {
233251
test.expected = retFallthrough
234252
}
235253

236-
payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5)
254+
payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5)
237255
// NOTE: golang.org/x/net/bpf returns int here rather
238256
// than uint32.
239257
rawRet, err := filter.Run(payload)
@@ -247,7 +265,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
247265
t.Logf(" [%4.1d] %s", idx, insn)
248266
}
249267
t.Logf("payload: %#v", payload)
250-
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected)
268+
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected)
251269
}
252270
}
253271
}
@@ -263,7 +281,14 @@ var testActions = map[string]configs.Action{
263281

264282
func TestEnosysStub_SingleArch(t *testing.T) {
265283
for _, arch := range testArches {
266-
arches := []string{arch}
284+
var arches []string
285+
// "native" indicates a blank architecture field for seccomp, to test
286+
// the case where the running architecture was not included in the
287+
// architecture list. Docker doesn't always set the architecture for some
288+
// reason (namely for ppc64le).
289+
if arch != "native" {
290+
arches = append(arches, arch)
291+
}
267292
t.Run("arch="+arch, func(t *testing.T) {
268293
for name, action := range testActions {
269294
t.Run("action="+name, func(t *testing.T) {
@@ -277,7 +302,16 @@ func TestEnosysStub_SingleArch(t *testing.T) {
277302
func TestEnosysStub_MultiArch(t *testing.T) {
278303
for end := 0; end < len(testArches); end++ {
279304
for start := 0; start < end; start++ {
280-
arches := testArches[start:end]
305+
var arches []string
306+
for _, arch := range testArches[start:end] {
307+
// "native" indicates a blank architecture field for seccomp, to test
308+
// the case where the running architecture was not included in the
309+
// architecture list. Docker doesn't always set the architecture for some
310+
// reason (namely for ppc64le).
311+
if arch != "native" {
312+
arches = append(arches, arch)
313+
}
314+
}
281315
if len(arches) <= 1 {
282316
continue
283317
}

0 commit comments

Comments
 (0)