Skip to content

Commit cccc98a

Browse files
authored
Fix disallowed cr0 write protection and close_fd (#80)
Since the commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 [1] (kernel version v5.3+ [2]) the sensitive CR0 bits in x86 is pinned, we need to use the inline asm [3][4] to bypass it. commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 : > With sensitive CR4 bits pinned now, it's possible that the WP bit for > CR0 might become a target as well. > > Following the same reasoning for the CR4 pinning, pin CR0's WP > bit. Contrary to the cpu feature dependend CR4 pinning this can be done > with a constant value. Also, getting "sys_call_table" [8] from the symbol lookup by using the address of "close_fd" does not work for v5.11+ [5][6]. The reason is the entry of "sys_call_table[__NR_close]" is not the address of "close_fd", actually it is "__x64_sys_close" in x86. Two solutions were proposed: using "kallsyms_lookup_name" [7] or just specifying the address into the module. The symbol "kallsyms_lookup_name" is unexported since v5.7; the address of "sys_call_table" can be found in "/boot/System.map" or "/proc/kallsyms". Since v5.7, the manual symbol lookup is not guaranteed to work because of control-flow integrity (or control-flow enforcement [9][10]) is added [11] for x86, but it is disabled since v5.11 [12][13]. To make sure manual symbol lookup work, it only uses up to v5.4. Reference: [1] torvalds/linux@8dbec27 [2] https://outflux.net/blog/archives/2019/11/14/security-things-in-linux-v5-3/ [3] https://patchwork.kernel.org/project/linux-kbuild/patch/[email protected]/ [4] https://stackoverflow.com/questions/58512430/how-to-write-to-protected-pages-in-the-linux-kernel [5] https://lore.kernel.org/bpf/[email protected]/ [6] https://lore.kernel.org/bpf/[email protected]/ [7] torvalds/linux@0bd476e [8] torvalds/linux@8f27766 [9] https://lore.kernel.org/lkml/[email protected]/ [10] https://lore.kernel.org/linux-doc/[email protected]/T/ [11] torvalds/linux@5790921 [12] torvalds/linux@20bf2b3 [13] https://lore.kernel.org/bpf/[email protected]/
1 parent d3bde7d commit cccc98a

File tree

2 files changed

+193
-12
lines changed

2 files changed

+193
-12
lines changed

examples/syscall.c

Lines changed: 99 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,44 @@
2323
#include <linux/sched.h>
2424
#include <linux/uaccess.h>
2525

26+
27+
/* The way we access "sys_call_table" varies as the kernel internals change.
28+
* - ver <= 5.4 : manual symbol lookup
29+
* - 5.4 < ver < 5.7 : kallsyms_lookup_name
30+
* - 5.7 <= ver : Kprobes or specific kernel module parameter
31+
*/
32+
2633
/* The in-kernel calls to the ksys_close() syscall were removed in Linux v5.11+.
2734
*/
28-
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0))
29-
#include <linux/syscalls.h> /* ksys_close() wrapper for backward compatibility */
30-
#define close_fd ksys_close
35+
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0))
36+
37+
#if LINUX_VERSION_CODE <= KERNEL_VERSION(5, 4, 0)
38+
#define HAVE_KSYS_CLOSE 1
39+
#include <linux/syscalls.h> /* For ksys_close() */
3140
#else
32-
#include <linux/fdtable.h> /* For close_fd */
41+
#include <linux/kallsyms.h> /* For kallsyms_lookup_name */
42+
#endif
43+
44+
#else
45+
46+
#if defined(CONFIG_KPROBES)
47+
#define HAVE_KPROBES 1
48+
#include <linux/kprobes.h>
49+
#else
50+
#define HAVE_PARAM 1
51+
#include <linux/kallsyms.h> /* For sprint_symbol */
52+
/* The address of the sys_call_table, which can be obtained by looking it up in
53+
* "/boot/System.map" or "/proc/kallsyms". When the kernel version is v5.7+,
54+
* without CONFIG_KPROBES, you must supply this parameter; otherwise the module
55+
* cannot locate the table and fails to load.
56+
*/
57+
static unsigned long sym = 0;
58+
module_param(sym, ulong, 0644);
59+
#endif
60+
3361
#endif
3462

3563
unsigned long **sys_call_table;
36-
unsigned long original_cr0;
3764

3865
/* UID we want to spy on - will be filled from the command line. */
3966
static int uid;
@@ -83,37 +110,97 @@ asmlinkage int our_sys_open(const char *filename, int flags, int mode)
83110

84111
static unsigned long **aquire_sys_call_table(void)
85112
{
113+
#ifdef HAVE_KSYS_CLOSE
86114
unsigned long int offset = PAGE_OFFSET;
87115
unsigned long **sct;
88116

89117
while (offset < ULLONG_MAX) {
90118
sct = (unsigned long **) offset;
91119

92-
if (sct[__NR_close] == (unsigned long *) close_fd)
120+
if (sct[__NR_close] == (unsigned long *) ksys_close)
93121
return sct;
94122

95123
offset += sizeof(void *);
96124
}
97125

98126
return NULL;
127+
#endif
128+
129+
#ifdef HAVE_PARAM
130+
const char sct_name[15] = "sys_call_table";
131+
char symbol[40] = {0};
132+
133+
if (sym == 0) {
134+
pr_alert(
135+
"For Linux v5.7+, Kprobes is the preferable way to get "
136+
"symbol.\n");
137+
pr_info(
138+
"If Kprobes is absent, you have to specify the address of "
139+
"sys_call_table symbol\n");
140+
pr_info(
141+
"by /boot/System.map or /proc/kallsyms, which contains all the "
142+
"symbol addresses, into sym parameter.\n");
143+
return NULL;
144+
}
145+
sprint_symbol(symbol, sym);
146+
if (!strncmp(sct_name, symbol, sizeof(sct_name) - 1))
147+
return (unsigned long **) sym;
148+
149+
return NULL;
150+
#endif
151+
152+
#ifdef HAVE_KPROBES
153+
unsigned long (*kallsyms_lookup_name)(const char *name);
154+
struct kprobe kp = {
155+
.symbol_name = "kallsyms_lookup_name",
156+
};
157+
158+
if (register_kprobe(&kp) < 0)
159+
return NULL;
160+
kallsyms_lookup_name = (unsigned long (*)(const char *name)) kp.addr;
161+
unregister_kprobe(&kp);
162+
#endif
163+
164+
return (unsigned long **) kallsyms_lookup_name("sys_call_table");
165+
}
166+
167+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
168+
static inline void __write_cr0(unsigned long cr0)
169+
{
170+
asm volatile("mov %0,%%cr0" : "+r"(cr0) : : "memory");
171+
}
172+
#else
173+
#define __write_cr0 write_cr0
174+
#endif
175+
176+
static void enable_write_protection(void)
177+
{
178+
unsigned long cr0 = read_cr0();
179+
set_bit(16, &cr0);
180+
__write_cr0(cr0);
181+
}
182+
183+
static void disable_write_protection(void)
184+
{
185+
unsigned long cr0 = read_cr0();
186+
clear_bit(16, &cr0);
187+
__write_cr0(cr0);
99188
}
100189

101190
static int __init syscall_start(void)
102191
{
103192
if (!(sys_call_table = aquire_sys_call_table()))
104193
return -1;
105194

106-
original_cr0 = read_cr0();
107-
108-
write_cr0(original_cr0 & ~0x00010000);
195+
disable_write_protection();
109196

110197
/* keep track of the original open function */
111198
original_call = (void *) sys_call_table[__NR_open];
112199

113200
/* use our open function instead */
114201
sys_call_table[__NR_open] = (unsigned long *) our_sys_open;
115202

116-
write_cr0(original_cr0);
203+
enable_write_protection();
117204

118205
pr_info("Spying on UID:%d\n", uid);
119206

@@ -133,9 +220,9 @@ static void __exit syscall_end(void)
133220
pr_alert("an unstable state.\n");
134221
}
135222

136-
write_cr0(original_cr0 & ~0x00010000);
223+
disable_write_protection();
137224
sys_call_table[__NR_open] = (unsigned long *) original_call;
138-
write_cr0(original_cr0);
225+
enable_write_protection();
139226

140227
msleep(2000);
141228
}

lkmpg.tex

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,100 @@ \section{System Calls}
12041204
So, if we want to change the way a certain system call works, what we need to do is to write our own function to implement it (usually by adding a bit of our own code, and then calling the original function) and then change the pointer at \cpp|sys_call_table| to point to our function.
12051205
Because we might be removed later and we don't want to leave the system in an unstable state, it's important for \cpp|cleanup_module| to restore the table to its original state.
12061206

1207+
To modify the content of \cpp|sys_call_table|, we need to consider the control register.
1208+
A control register is a processor register that changes or controls the general behavior of the CPU.
1209+
For x86 architecture, the \verb|cr0| register has various control flags that modify the basic operation of the processor.
1210+
The \verb|WP| flag in \verb|cr0| stands for write protection.
1211+
Once the \verb|WP| flag is set, the processor disallows further write attempts to the read-only sections.
1212+
Therefore, we must disable the \verb|WP| flag before modifying \cpp|sys_call_table|.
1213+
Since Linux v5.3, the \cpp|write_cr0| function can no longer be used for this, because the sensitive \verb|cr0| bits are pinned for security reasons: otherwise an attacker could write into the CPU control registers to disable CPU protections such as write protection.
1214+
As a result, we have to provide a custom assembly routine to bypass it.
1215+
1216+
However, the \cpp|sys_call_table| symbol is unexported to prevent misuse.
1217+
But there are a few ways to get the symbol: manual symbol lookup and \cpp|kallsyms_lookup_name|.
1218+
Here we use both, depending on the kernel version.
1219+
1220+
The manual symbol lookup is not guaranteed to work on every kernel because of \textit{control-flow integrity}, a technique that prevents an attacker from redirecting execution by making sure that indirect calls go to the expected addresses and that return addresses are not changed.
1221+
Since Linux v5.7, the kernel has carried a patch series for \textit{control-flow enforcement} (CET) on x86, and some configurations of GCC, like GCC versions 9 and 10 in Ubuntu, enable CET (the \verb|-fcf-protection| option) by default when building the kernel.
1222+
Using that GCC to compile the kernel with retpoline off may result in CET being enabled in the kernel.
1223+
You can use the following command to check whether the \verb|-fcf-protection| option is enabled:
1224+
\begin{verbatim}
1225+
$ gcc -v -Q -O2 --help=target | grep protection
1226+
Using built-in specs.
1227+
COLLECT_GCC=gcc
1228+
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/9/lto-wrapper
1229+
...
1230+
gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04)
1231+
COLLECT_GCC_OPTIONS='-v' '-Q' '-O2' '--help=target' '-mtune=generic' '-march=x86-64'
1232+
/usr/lib/gcc/x86_64-linux-gnu/9/cc1 -v ... -fcf-protection ...
1233+
GNU C17 (Ubuntu 9.3.0-17ubuntu1~20.04) version 9.3.0 (x86_64-linux-gnu)
1234+
...
1235+
\end{verbatim}
1236+
However, CET should not be enabled in the kernel, because it may break Kprobes and BPF.
1237+
Consequently, CET is disabled since v5.11.
1238+
To guarantee that the manual symbol lookup works, we only use it up to v5.4.
1239+
1240+
Unfortunately, since Linux v5.7 \cpp|kallsyms_lookup_name| is also unexported, so a trick is needed to get the address of \cpp|kallsyms_lookup_name|.
1241+
If \cpp|CONFIG_KPROBES| is enabled, we can facilitate the retrieval of function addresses by means of Kprobes to dynamically break into the specific kernel routine.
1242+
Kprobes inserts a breakpoint at the entry of the function by replacing the first bytes of the probed instruction.
1243+
When a CPU hits the breakpoint, registers are stored, and the control will pass to Kprobes.
1244+
It passes the addresses of the saved registers and the Kprobe struct to the handler you defined, then executes it.
1245+
Kprobes can be registered by symbol name or address.
1246+
When a symbol name is given, the address is resolved by the kernel.
1247+
1248+
Otherwise, pass the address of \cpp|sys_call_table|, taken from \verb|/proc/kallsyms| or \verb|/boot/System.map|, through the \cpp|sym| parameter.
1249+
Following is the sample usage for \verb|/proc/kallsyms|:
1250+
\begin{verbatim}
1251+
$ sudo grep sys_call_table /proc/kallsyms
1252+
ffffffff82000280 R x32_sys_call_table
1253+
ffffffff820013a0 R sys_call_table
1254+
ffffffff820023e0 R ia32_sys_call_table
1255+
$ sudo insmod syscall.ko sym=0xffffffff820013a0
1256+
\end{verbatim}
1257+
1258+
When using the address from \verb|/boot/System.map|, be careful about \verb|KASLR| (Kernel Address Space Layout Randomization).
1259+
\verb|KASLR| may randomize the addresses of kernel code and data at every boot, so that the static address listed in \verb|/boot/System.map| will be offset by some entropy.
1260+
The purpose of \verb|KASLR| is to protect the kernel space from the attacker.
1261+
Without \verb|KASLR|, the attacker may find the target address in the fixed address easily.
1262+
Then the attacker can use return-oriented programming to insert malicious code to execute, or obtain the target data through a tampered pointer.
1263+
\verb|KASLR| mitigates these kinds of attacks because the attacker cannot immediately know the target address, but a brute-force attack can still work.
1264+
If the address of a symbol in \verb|/proc/kallsyms| is different from the address in \verb|/boot/System.map|, \verb|KASLR| is enabled in the kernel your system is running.
1265+
\begin{verbatim}
1266+
$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub
1267+
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
1268+
$ sudo grep sys_call_table /boot/System.map-$(uname -r)
1269+
ffffffff82000300 R sys_call_table
1270+
$ sudo grep sys_call_table /proc/kallsyms
1271+
ffffffff820013a0 R sys_call_table
1272+
# Reboot
1273+
$ sudo grep sys_call_table /boot/System.map-$(uname -r)
1274+
ffffffff82000300 R sys_call_table
1275+
$ sudo grep sys_call_table /proc/kallsyms
1276+
ffffffff86400300 R sys_call_table
1277+
\end{verbatim}
1278+
If \verb|KASLR| is enabled, we have to look up the address in \verb|/proc/kallsyms| again each time we reboot the machine.
1279+
In order to use the address from \verb|/boot/System.map|, make sure that \verb|KASLR| is disabled.
1280+
You can add \verb|nokaslr| to the kernel command line to disable \verb|KASLR| on the next boot:
1281+
\begin{verbatim}
1282+
$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub
1283+
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
1284+
$ sudo perl -i -pe 'm/quiet/ and s//quiet nokaslr/' /etc/default/grub
1285+
$ grep quiet /etc/default/grub
1286+
GRUB_CMDLINE_LINUX_DEFAULT="quiet nokaslr splash"
1287+
$ sudo update-grub
1288+
\end{verbatim}
1289+
1290+
For more information, check out the following:
1291+
1292+
\begin{itemize}
1293+
\item \href{https://lwn.net/Articles/804849/}{Cook: Security things in Linux v5.3}
1294+
\item \href{https://lwn.net/Articles/12211/}{Unexporting the system call table}
1295+
\item \href{https://lwn.net/Articles/810077/}{Control-flow integrity for the kernel}
1296+
\item \href{https://lwn.net/Articles/813350/}{Unexporting kallsyms\_lookup\_name()}
1297+
\item \href{https://www.kernel.org/doc/Documentation/kprobes.txt}{Kernel Probes (Kprobes)}
1298+
\item \href{https://lwn.net/Articles/569635/}{Kernel address space layout randomization}
1299+
\end{itemize}
1300+
12071301
The source code here is an example of such a kernel module.
12081302
We want to ``spy'' on a certain user, and to \cpp|pr_info()| a message whenever that user opens a file.
12091303
Towards this end, we replace the system call to open a file with our own function, called \cpp|our_sys_open|.

0 commit comments

Comments
 (0)