這一個章節要跟大家介紹如何寫一個filesystem,主要是讓大家對VFS的framework有一些基本認識。
首先要透過register_filesystem()註冊對應的filesystem,而register_filesystem()需要struct file_system_type的一些基本參數如下:
/*
 * Filesystem type descriptor handed to register_filesystem().
 *
 * .name    - type string matched against "mount -t <name>".
 * .mount   - called on mount; must return the root dentry.
 * .kill_sb - called on umount to tear the super block down.
 */
static struct file_system_type bv_fs_type = {
	.owner = THIS_MODULE,
	.name = "bvfs",
	.mount = bvfs_mount,
	.kill_sb = bvfs_kill_sb,
};
/*
 * The alias string must be identical to .name above: the kernel asks for
 * module "fs-<name>" when an unknown filesystem type is mounted, and
 * MODULE_ALIAS_FS(x) publishes the alias "fs-" x. The previous value "bv"
 * produced "fs-bv", which can never match a "mount -t bvfs" request.
 */
MODULE_ALIAS_FS("bvfs");
/*
 * Module entry point: log the call and register bv_fs_type with the VFS.
 *
 * Returns whatever register_filesystem() returns.
 */
static int __init init_bv_fs(void)
{
	int ret;

	printk("%s(#%d)\n", __func__, __LINE__);

	ret = register_filesystem(&bv_fs_type);
	return ret;
}
module_init(init_bv_fs);
name是filesystem type的name,mount是mount的時候會被呼叫的method,mount() method必須回傳root dentry,kill_sb則是umount會被呼叫的method。
通常mount() method會調用generic mount() implementations,包含mount_bdev、mount_nodev、mount_single並帶入fill_super() callback用於初始化struct super_block *與創建root dentry。關係圖概略如下:
在bvfs_fill_super()中主要要填入super block operations,並透過bvfs_get_inode()取得mount point/root的inode,再透過d_make_root()取得root dentry。
/*
 * Allocate and initialise a new inode on @sb.
 *
 * @sb:   super block the inode belongs to
 * @dir:  parent directory inode passed to inode_init_owner() (NULL for root)
 * @mode: file type and permission bits
 * @dev:  device number, only used for the special-file (default) case
 *
 * Returns the new inode, or NULL when new_inode() fails.
 */
struct inode *bvfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev)
{
	struct inode *node = new_inode(sb);

	pr_debug("%s(#%d): mode:0%o\n", __func__, __LINE__, mode);

	/* Allocation failed: nothing to initialise. */
	if (!node)
		return NULL;

	node->i_ino = get_next_ino();
	pr_debug("%s(#%d): i_ino:%lx\n", __func__, __LINE__, node->i_ino);

	inode_init_owner(node, dir, mode);
	node->i_mapping->a_ops = &bvfs_aops;

	/* All three timestamps start out at the same "now". */
	node->i_ctime = current_time(node);
	node->i_mtime = node->i_ctime;
	node->i_atime = node->i_ctime;

	/* Pick the operation tables according to the file type. */
	switch (mode & S_IFMT) {
	case S_IFDIR:
		node->i_op = &bvfs_dir_inode_ops;
		node->i_fop = &simple_dir_operations;
		/* Directories get one extra link on top of the initial count. */
		inc_nlink(node);
		break;
	case S_IFREG:
		node->i_op = &bvfs_file_inode_operations;
		node->i_fop = &bvfs_file_operations;
		break;
	default:
		/* Every other file type is handed to init_special_inode(). */
		init_special_inode(node, mode, dev);
		break;
	}

	return node;
}
#define BVFS_MAGIC 0x20201204

/*
 * fill_super callback given to mount_nodev(): set up @sb and create the
 * root dentry.
 *
 * @sb:     super block to initialise
 * @data:   mount data, only printed here
 * @silent: only printed here
 *
 * Returns 0 on success, -ENOMEM when the private info, the root inode or
 * the root dentry cannot be allocated.
 */
static int bvfs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct bvfs_fs_info *info;
	struct inode *root;

	pr_debug("%s(#%d): data:%s, silent:%d\n", __func__, __LINE__, (char *) data, silent);

	/* s_fs_info is assigned before the NULL check, so it is NULL on failure. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	sb->s_fs_info = info;
	if (info == NULL) {
		pr_err("%s(#%d): kzalloc()\n", __func__, __LINE__);
		return -ENOMEM;
	}

	/* rwxr-xr-x for the root directory. */
	info->mount_opts.mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;

	/* Basic super block parameters. */
	sb->s_magic = BVFS_MAGIC;
	sb->s_op = &bvfs_ops;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_time_gran = 1;

	/* The root of the mount is a directory inode without a parent. */
	root = bvfs_get_inode(sb, NULL, S_IFDIR | info->mount_opts.mode, 0);
	if (root == NULL) {
		pr_err("%s(#%d): bvfs_get_inode failed\n", __func__, __LINE__);
		return -ENOMEM;
	}

	sb->s_root = d_make_root(root);
	if (sb->s_root == NULL) {
		pr_err("%s(#%d): d_make_root()\n", __func__, __LINE__);
		return -ENOMEM;
	}

	/* Print the call chain that led here (tutorial aid). */
	dump_stack();
	return 0;
}
/*
 * .mount method of bv_fs_type: log the arguments and delegate to
 * mount_nodev() with bvfs_fill_super() as the fill_super callback.
 *
 * Returns the value produced by mount_nodev().
 */
static struct dentry *bvfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
{
	struct dentry *root;

	pr_debug("%s(#%d): flags:%d, dev_name:%s, data:%s\n", __func__, __LINE__, flags, dev_name, (char *) data);

	root = mount_nodev(fs_type, flags, data, bvfs_fill_super);
	return root;
}
static void bvfs_kill_sb(struct super_block *sb)
{
pr_debug("%s(#%d)\n", __func__, __LINE__);
kill_litter_super(sb);
}
透過簡單的mount可以更了解一下整個call flow
/ # sysctl kernel.printk='8 8 8 8'
kernel.printk = 8 8 8 8
/ # insmod bvfs.ko
[ 181.936205] init_bv_fs(#425)
/ # mount bvfs mnt -t bvfs
sys_mount()
|-> ksys_mount()
|-> do_mount()
|-> do_new_mount()
|-> vfs_kern_mount()
|-> mount_fs()
|-> bvfs_mount()
|-> mount_nodev()
|-> bvfs_fill_super()
bvfs_mount(#392): flags:32768, dev_name:bvfs, data:(null)
bvfs_fill_super(#346): data:(null), silent:1
bvfs_get_inode(#302): mode:040755
bvfs_get_inode(#305): i_ino:1340
CPU: 0 PID: 810 Comm: mount Tainted: G O 4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000b34>] (bvfs_fill_super+0x108/0x124 [bvfs])
[<7f000b34>] (bvfs_fill_super [bvfs]) from [<8024d8ec>] (mount_nodev+0x44/0x90)
[<8024d8ec>] (mount_nodev) from [<7f000468>] (bvfs_mount+0x68/0x78 [bvfs])
[<7f000468>] (bvfs_mount [bvfs]) from [<8024e3f0>] (mount_fs+0x14/0xa8)
[<8024e3f0>] (mount_fs) from [<8026af6c>] (vfs_kern_mount.part.3+0x48/0xf8)
[<8026af6c>] (vfs_kern_mount.part.3) from [<8026d80c>] (do_mount+0x57c/0xc80)
[<8026d80c>] (do_mount) from [<8026e2a4>] (ksys_mount+0x8c/0xb4)
[<8026e2a4>] (ksys_mount) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x8550dfa8 to 0x8550dff0)
dfa0: 00000000 00000000 7e9d8f85 7e9d8f8a 7e9d8f91 00008000
dfc0: 00000000 00000000 7e9d8f85 00000015 7e9d8f91 00008000 00000000 00000000
dfe0: 00000000 7e9d8b68 0007e5a7 00012fba
/ # cd mnt
sys_stat64()
|-> vfs_statx()
|-> filename_lookup()
|-> walk_component()
|-> lookup_slow()
|-> __lookup_slow()
|-> bvfs_lookup()
bvfs_lookup(#218):
CPU: 0 PID: 800 Comm: sh Tainted: G O 4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000224>] (bvfs_lookup+0x40/0x5c [bvfs])
[<7f000224>] (bvfs_lookup [bvfs]) from [<802551a0>] (__lookup_slow+0x8c/0x154)
[<802551a0>] (__lookup_slow) from [<80255298>] (lookup_slow+0x30/0x44)
[<80255298>] (lookup_slow) from [<802579c8>] (walk_component+0x1c8/0x300)
[<802579c8>] (walk_component) from [<802580c8>] (path_lookupat+0x70/0x1fc)
[<802580c8>] (path_lookupat) from [<80259da8>] (filename_lookup+0x9c/0x10c)
[<80259da8>] (filename_lookup) from [<8024f3d0>] (vfs_statx+0x68/0xd0)
[<8024f3d0>] (vfs_statx) from [<8024fbc8>] (sys_stat64+0x38/0x68)
[<8024fbc8>] (sys_stat64) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x85505fa8 to 0x85505ff0)
5fa0: 0018a058 00000000 0018a25c 7e97caf8 7e97caf8 0018a266
5fc0: 0018a058 00000000 00000001 000000c3 001864dc 00186478 00000000 7e97cadc
5fe0: 000000c3 7e97cac4 000ddb9b 00011726
基本上dentry就是一個representation/cache結構,用於反映file-system的目錄結構,而不須真的從file-system讀出資料。
A "dentry" in the Linux kernel is the in-memory representation of a directory entry;
it is a way of remembering the resolution of a given file or directory name
without having to search through the filesystem to find it.
The dentry cache speeds lookups considerably;
keeping dentries for frequently accessed names like /tmp, /dev/null,
or /usr/bin/tetris saves a lot of filesystem I/O.
概略的一些關係圖如下,希望能讓大家對這些關係有較清楚的概念。
目前為止的功能只能達到mount,其餘的下一個章節繼續囉。