Skip to content

Commit 3216d3b

Browse files
committed
merge #4391 into opencontainers/runc:release-1.1
Aleksa Sarai (2):
  [1.1] seccomp: patchbpf: always include native architecture in stub
  [1.1] seccomp: patchbpf: rename nativeArch -> linuxAuditArch

Kir Kolyshkin (1):
  [1.1] libct/seccomp/patchbpf: rm duplicated code

LGTMs: kolyshkin, rata
2 parents bd671b6 + 618e149 commit 3216d3b

File tree

2 files changed

+110
-66
lines changed

2 files changed

+110
-66
lines changed

libcontainer/seccomp/patchbpf/enosys_linux.go

Lines changed: 59 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -164,97 +164,101 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
164164
return program, nil
165165
}
166166

167-
type nativeArch uint32
167+
type linuxAuditArch uint32
168168

169-
const invalidArch nativeArch = 0
169+
const invalidArch linuxAuditArch = 0
170170

171-
func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
171+
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
172172
switch arch {
173173
case libseccomp.ArchNative:
174174
// Convert to actual native architecture.
175175
arch, err := libseccomp.GetNativeArch()
176176
if err != nil {
177177
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
178178
}
179-
return archToNative(arch)
179+
return scmpArchToAuditArch(arch)
180180
case libseccomp.ArchX86:
181-
return nativeArch(C.C_AUDIT_ARCH_I386), nil
181+
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
182182
case libseccomp.ArchAMD64, libseccomp.ArchX32:
183183
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
184184
// 30th bit of the syscall number set to indicate that it's not a
185185
// normal x86_64 syscall.
186-
return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
186+
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
187187
case libseccomp.ArchARM:
188-
return nativeArch(C.C_AUDIT_ARCH_ARM), nil
188+
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
189189
case libseccomp.ArchARM64:
190-
return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
190+
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
191191
case libseccomp.ArchMIPS:
192-
return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
192+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
193193
case libseccomp.ArchMIPS64:
194-
return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
194+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
195195
case libseccomp.ArchMIPS64N32:
196-
return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
196+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
197197
case libseccomp.ArchMIPSEL:
198-
return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
198+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
199199
case libseccomp.ArchMIPSEL64:
200-
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
200+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
201201
case libseccomp.ArchMIPSEL64N32:
202-
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
202+
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
203203
case libseccomp.ArchPPC:
204-
return nativeArch(C.C_AUDIT_ARCH_PPC), nil
204+
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
205205
case libseccomp.ArchPPC64:
206-
return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
206+
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
207207
case libseccomp.ArchPPC64LE:
208-
return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
208+
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
209209
case libseccomp.ArchS390:
210-
return nativeArch(C.C_AUDIT_ARCH_S390), nil
210+
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
211211
case libseccomp.ArchS390X:
212-
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
212+
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
213213
case libseccomp.ArchRISCV64:
214-
return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
214+
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
215215
default:
216216
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
217217
}
218218
}
219219

220-
type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
220+
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
221221

222222
// Figure out largest syscall number referenced in the filter for each
223223
// architecture. We will be generating code based on the native architecture
224224
// representation, but SCMP_ARCH_X32 means we have to track cases where the
225225
// same architecture has different largest syscalls based on the mode.
226226
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
227-
lastSyscalls := make(lastSyscallMap)
228-
// Only loop over architectures which are present in the filter. Any other
229-
// architectures will get the libseccomp bad architecture action anyway.
227+
scmpArchs := make(map[libseccomp.ScmpArch]struct{})
230228
for _, ociArch := range config.Architectures {
231229
arch, err := libseccomp.GetArchFromString(ociArch)
232230
if err != nil {
233231
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
234232
}
233+
scmpArchs[arch] = struct{}{}
234+
}
235+
// On architectures like ppc64le, Docker inexplicably doesn't include the
236+
// native architecture in the architecture list which results in no
237+
// architectures being present in the list at all (rendering the ENOSYS
238+
// stub a no-op). So, always include the native architecture.
239+
if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
240+
return nil, fmt.Errorf("unable to get native arch: %w", err)
241+
} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
242+
logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
243+
scmpArchs[nativeScmpArch] = struct{}{}
244+
}
245+
logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)
235246

236-
// Map native architecture to a real architecture value to avoid
237-
// doubling-up the lastSyscall mapping.
238-
if arch == libseccomp.ArchNative {
239-
nativeArch, err := libseccomp.GetNativeArch()
240-
if err != nil {
241-
return nil, fmt.Errorf("unable to get native architecture: %w", err)
242-
}
243-
arch = nativeArch
244-
}
245-
246-
// Figure out native architecture representation of the architecture.
247-
nativeArch, err := archToNative(arch)
247+
// Only loop over architectures which are present in the filter. Any other
248+
// architectures will get the libseccomp bad architecture action anyway.
249+
lastSyscalls := make(lastSyscallMap)
250+
for arch := range scmpArchs {
251+
auditArch, err := scmpArchToAuditArch(arch)
248252
if err != nil {
249253
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
250254
}
251255

252-
if _, ok := lastSyscalls[nativeArch]; !ok {
253-
lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
256+
if _, ok := lastSyscalls[auditArch]; !ok {
257+
lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
254258
}
255-
if _, ok := lastSyscalls[nativeArch][arch]; ok {
259+
if _, ok := lastSyscalls[auditArch][arch]; ok {
256260
// Because of ArchNative we may hit the same entry multiple times.
257-
// Just skip it if we've seen this (nativeArch, ScmpArch)
261+
// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
258262
// combination before.
259263
continue
260264
}
@@ -272,10 +276,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
272276
}
273277
}
274278
if largestSyscall != 0 {
275-
lastSyscalls[nativeArch][arch] = largestSyscall
279+
logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
280+
lastSyscalls[auditArch][arch] = largestSyscall
276281
} else {
277-
logrus.Warnf("could not find any syscalls for arch %s", ociArch)
278-
delete(lastSyscalls[nativeArch], arch)
282+
logrus.Warnf("could not find any syscalls for arch %v", arch)
283+
delete(lastSyscalls[auditArch], arch)
279284
}
280285
}
281286
return lastSyscalls, nil
@@ -293,10 +298,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
293298
// close_range(2) which were added out-of-order in the syscall table between
294299
// kernel releases.
295300
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
296-
// A jump-table for each nativeArch used to generate the initial
301+
// A jump-table for each linuxAuditArch used to generate the initial
297302
// conditional jumps -- measured from the *END* of the program so they
298303
// remain valid after prepending to the tail.
299-
archJumpTable := map[nativeArch]uint32{}
304+
archJumpTable := map[linuxAuditArch]uint32{}
300305

301306
// Generate our own -ENOSYS rules for each architecture. They have to be
302307
// generated in reverse (prepended to the tail of the program) because the
@@ -309,7 +314,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
309314
}
310315

311316
// Generate the syscall -ENOSYS rules.
312-
for nativeArch, maxSyscalls := range lastSyscalls {
317+
for auditArch, maxSyscalls := range lastSyscalls {
313318
// The number of instructions from the tail of this section which need
314319
// to be jumped in order to reach the -ENOSYS return. If the section
315320
// does not jump, it will fall through to the actual filter.
@@ -390,7 +395,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
390395

391396
// If we're on x86 we need to add a check for x32 and if we're in
392397
// the wrong mode we jump over the section.
393-
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
398+
if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
394399
// Generate a prefix to check the mode.
395400
switch scmpArch {
396401
case libseccomp.ArchAMD64:
@@ -419,8 +424,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
419424
section = append(section, sectionTail...)
420425
case 2:
421426
// x32 and x86_64 are a unique case, we can't handle any others.
422-
if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
423-
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
427+
if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
428+
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
424429
}
425430

426431
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
@@ -497,7 +502,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
497502
programTail = append(section, programTail...)
498503

499504
// Update jump table.
500-
archJumpTable[nativeArch] = uint32(len(programTail))
505+
archJumpTable[auditArch] = uint32(len(programTail))
501506
}
502507

503508
// Add a dummy "jump to filter" for any architecture we might miss below.
@@ -517,9 +522,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
517522
// architectures based on how large the jumps are going to be, or
518523
// re-sort the candidate architectures each time to make sure that we
519524
// pick the largest jump which is going to be smaller than 255.
520-
for nativeArch := range lastSyscalls {
525+
for auditArch := range lastSyscalls {
521526
// We jump forwards but the jump table is calculated from the *END*.
522-
jump := uint32(len(programTail)) - archJumpTable[nativeArch]
527+
jump := uint32(len(programTail)) - archJumpTable[auditArch]
523528

524529
// Same routine as above -- this is a basic jeq check, complicated
525530
// slightly if it turns out that we need to do a long jump.
@@ -528,7 +533,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
528533
// jeq [arch],[jump]
529534
bpf.JumpIf{
530535
Cond: bpf.JumpEqual,
531-
Val: uint32(nativeArch),
536+
Val: uint32(auditArch),
532537
SkipTrue: uint8(jump),
533538
},
534539
}, programTail...)
@@ -537,7 +542,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
537542
// jne [arch],1
538543
bpf.JumpIf{
539544
Cond: bpf.JumpNotEqual,
540-
Val: uint32(nativeArch),
545+
Val: uint32(auditArch),
541546
SkipTrue: 1,
542547
},
543548
// ja [jump]

libcontainer/seccomp/patchbpf/enosys_linux_test.go

Lines changed: 51 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/opencontainers/runc/libcontainer/configs"
1313

1414
libseccomp "github.com/seccomp/libseccomp-golang"
15+
"github.com/sirupsen/logrus"
1516
"golang.org/x/net/bpf"
1617
)
1718

@@ -23,7 +24,7 @@ type seccompData struct {
2324
}
2425

2526
// mockSyscallPayload creates a fake seccomp_data struct with the given data.
26-
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte {
27+
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte {
2728
var buf bytes.Buffer
2829

2930
data := seccompData{
@@ -105,6 +106,18 @@ var testArches = []string{
105106
"ppc64le",
106107
"s390",
107108
"s390x",
109+
// Dummy value to indicate a configuration with no architecture specified.
110+
"native",
111+
}
112+
113+
var nativeArch string
114+
115+
func init() {
116+
scmpNativeArch, err := libseccomp.GetNativeArch()
117+
if err != nil {
118+
logrus.Panicf("get native arch: %v", err)
119+
}
120+
nativeArch = scmpNativeArch.String()
108121
}
109122

110123
func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
@@ -150,17 +163,20 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
150163

151164
for _, arch := range testArches {
152165
type syscallTest struct {
153-
syscall string
154166
sysno libseccomp.ScmpSyscall
167+
syscall string
155168
expected uint32
156169
}
157170

171+
if arch == "native" {
172+
arch = nativeArch
173+
}
158174
scmpArch, err := libseccomp.GetArchFromString(arch)
159175
if err != nil {
160176
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
161177
}
162178

163-
nativeArch, err := archToNative(scmpArch)
179+
auditArch, err := scmpArchToAuditArch(scmpArch)
164180
if err != nil {
165181
t.Fatalf("unknown audit architecture %q: %v", arch, err)
166182
}
@@ -179,9 +195,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
179195
t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err)
180196
}
181197
syscallTests = append(syscallTests, syscallTest{
182-
syscall,
183-
sysno,
184-
expected,
198+
sysno: sysno,
199+
syscall: syscall,
200+
expected: expected,
185201
})
186202
}
187203

@@ -228,12 +244,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
228244

229245
// Test syscalls in the explicit list.
230246
for _, test := range syscallTests {
231-
// Override the expected value in the two special cases.
232-
if !archSet[arch] || isAllowAction(defaultAction) {
247+
// Override the expected value in the two special cases:
248+
// 1. If the default action is allow, the filter won't have
249+
// the stub prepended so we expect a fallthrough.
250+
// 2. If the executing architecture is not in the architecture
251+
// set, then the architecture is not handled by the stub --
252+
// *except* in the case of the native architecture (which
253+
// is always included in the stub).
254+
if isAllowAction(defaultAction) ||
255+
(!archSet[arch] && arch != nativeArch) {
233256
test.expected = retFallthrough
234257
}
235258

236-
payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5)
259+
payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5)
237260
// NOTE: golang.org/x/net/bpf returns int here rather
238261
// than uint32.
239262
rawRet, err := filter.Run(payload)
@@ -247,7 +270,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
247270
t.Logf(" [%4.1d] %s", idx, insn)
248271
}
249272
t.Logf("payload: %#v", payload)
250-
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected)
273+
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected)
251274
}
252275
}
253276
}
@@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{
263286

264287
func TestEnosysStub_SingleArch(t *testing.T) {
265288
for _, arch := range testArches {
266-
arches := []string{arch}
289+
var arches []string
290+
// "native" indicates a blank architecture field for seccomp, to test
291+
// the case where the running architecture was not included in the
292+
// architecture list. Docker doesn't always set the architecture for some
293+
// reason (namely for ppc64le).
294+
if arch != "native" {
295+
arches = append(arches, arch)
296+
}
267297
t.Run("arch="+arch, func(t *testing.T) {
268298
for name, action := range testActions {
269299
t.Run("action="+name, func(t *testing.T) {
@@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) {
277307
func TestEnosysStub_MultiArch(t *testing.T) {
278308
for end := 0; end < len(testArches); end++ {
279309
for start := 0; start < end; start++ {
280-
arches := testArches[start:end]
310+
var arches []string
311+
for _, arch := range testArches[start:end] {
312+
// "native" indicates a blank architecture field for seccomp, to test
313+
// the case where the running architecture was not included in the
314+
// architecture list. Docker doesn't always set the architecture for some
315+
// reason (namely for ppc64le).
316+
if arch != "native" {
317+
arches = append(arches, arch)
318+
}
319+
}
281320
if len(arches) <= 1 {
282321
continue
283322
}

0 commit comments

Comments
 (0)