這一個章節要跟大家介紹如何寫一個filesystem,主要是讓大家對VFS的framework有一些基本認識。
首先要透過register_filesystem()註冊對應的filesystem,而register_filesystem()需要struct file_system_type的一些基本參數如下:
static struct file_system_type bv_fs_type = { // for internal VFS use: you should initialize this to THIS_MODULE in most cases .owner = THIS_MODULE, // the name of the filesystem type, such as "ext2", "iso9660" .name = "bvfs", // the method to call when a new instance of this filesystem should be mounted .mount = bvfs_mount, // the method to call when an instance of this filesystem should be shut down .kill_sb = bvfs_kill_sb, }; MODULE_ALIAS_FS("bv"); static int __init init_bv_fs(void) { printk("%s(#%d)\n", __func__, __LINE__); return register_filesystem(&bv_fs_type); } module_init(init_bv_fs);name是filesystem type的name,mount是mount的時候會被呼叫的method,mount() method必須回傳root dentry,kill_sb則是umount會被呼叫的method。
通常mount() method會調用generic mount() implementations,包含mount_bdev、mount_nodev、mount_single並帶入fill_super() callback用於初始化struct super_block *與創建root dentry。關係圖概略如下:
在bvfs_fill_super()中主要要填入super block operations,並透過bvfs_get_inode()取得mount point/root的inode,再透過d_make_root()取得root dentry。
struct inode *bvfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { // Allocates a new inode for given superblock. // The default gfp_mask for allocations related // to inode->i_mapping is GFP_HIGHUSER_MOVABLE. // If HIGHMEM pages are unsuitable or it is known // that pages allocated for the page cache are // not reclaimable or migratable, mapping_set_gfp_mask // must be called with suitable flags on // the newly created inode's mapping struct inode *inode = new_inode(sb); pr_debug("%s(#%d): mode:0%o\n", __func__, __LINE__, mode); if (inode) { inode->i_ino = get_next_ino(); pr_debug("%s(#%d): i_ino:%lx\n", __func__, __LINE__, inode->i_ino); // Init uid,gid,mode for new inode according to posix standards inode_init_owner(inode, dir, mode); // add file operation inode->i_mapping->a_ops = &bvfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &bvfs_file_inode_operations; inode->i_fop = &bvfs_file_operations; break; case S_IFDIR: inode->i_op = &bvfs_dir_inode_ops; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; } } return inode; } #define BVFS_MAGIC 0x20201204 /** * create and populate the root directory for our new filesystem. */ static int bvfs_fill_super(struct super_block *sb, void *data, int silent) { struct bvfs_fs_info *fsi; struct inode *inode; pr_debug("%s(#%d): data:%s, silent:%d\n", __func__, __LINE__, (char *) data, silent); fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); sb->s_fs_info = fsi; if (!fsi) { pr_err("%s(#%d): kzalloc()\n", __func__, __LINE__); return -ENOMEM; } // set default mount permission to 755 fsi->mount_opts.mode = S_IRWXU | (S_IRGRP | S_IXGRP) | (S_IROTH | S_IXOTH); // The super block operations are set at the time of mounting. 
sb->s_maxbytes = MAX_LFS_FILESIZE; // The maximum file system block size is limited by the page cache size // (which is 4k on x86 systems). sb->s_blocksize = PAGE_SIZE; // The number of bits that make up the filesystem block size sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = BVFS_MAGIC; sb->s_op = &bvfs_ops; sb->s_time_gran = 1; // passed S_IFDIR, the returned inode will describe a directory inode = bvfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { pr_err("%s(#%d): bvfs_get_inode failed\n", __func__, __LINE__); return -ENOMEM; } // This directory inode must be put into the directory cache // (by way of a "dentry" structure) so that the VFS can find it sb->s_root = d_make_root(inode); if (!sb->s_root) { pr_err("%s(#%d): d_make_root()\n", __func__, __LINE__); return -ENOMEM; } dump_stack(); return 0; } /* * @param fs_type describes the filesystem, partly initialized by the specific filesystem code * @param flags mount flags * @param dev_name the device name we are mounting * @param data arbitrary mount options, usually comes as an ASCII string * * @return struct dentry * */ static struct dentry *bvfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { pr_debug("%s(#%d): flags:%d, dev_name:%s, data:%s\n", __func__, __LINE__, flags, dev_name, (char *) data); /* * Usually, a filesystem uses one of the generic mount() implementations * and provides a fill_super() callback instead. 
The generic variants are: * * mount_bdev: mount a filesystem residing on a block device * * mount_nodev: mount a filesystem that is not backed by a device * * mount_single: mount a filesystem which shares the instance between all mounts */ return mount_nodev(fs_type, flags, data, bvfs_fill_super); } static void bvfs_kill_sb(struct super_block *sb) { pr_debug("%s(#%d)\n", __func__, __LINE__); // kill_litter_super() is a generic function provided by the VFS; // it simply cleans up all of the in-core structures when the filesystem is unmounted kill_litter_super(sb); }
透過簡單的mount可以更了解一下整個call flow
/ # sysctl kernel.printk='8 8 8 8'
kernel.printk = 8 8 8 8
/ # insmod bvfs.ko
[ 181.936205] init_bv_fs(#425)
/ # mount bvfs mnt -t bvfs
sys_mount()
|-> ksys_mount()
|-> do_mount()
|-> do_new_mount()
|-> vfs_kern_mount()
|-> mount_fs()
|-> bvfs_mount()
|-> mount_nodev()
|-> bvfs_fill_super()
bvfs_mount(#392): flags:32768, dev_name:bvfs, data:(null)
bvfs_fill_super(#346): data:(null), silent:1
bvfs_get_inode(#302): mode:040755
bvfs_get_inode(#305): i_ino:1340
CPU: 0 PID: 810 Comm: mount Tainted: G O 4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000b34>] (bvfs_fill_super+0x108/0x124 [bvfs])
[<7f000b34>] (bvfs_fill_super [bvfs]) from [<8024d8ec>] (mount_nodev+0x44/0x90)
[<8024d8ec>] (mount_nodev) from [<7f000468>] (bvfs_mount+0x68/0x78 [bvfs])
[<7f000468>] (bvfs_mount [bvfs]) from [<8024e3f0>] (mount_fs+0x14/0xa8)
[<8024e3f0>] (mount_fs) from [<8026af6c>] (vfs_kern_mount.part.3+0x48/0xf8)
[<8026af6c>] (vfs_kern_mount.part.3) from [<8026d80c>] (do_mount+0x57c/0xc80)
[<8026d80c>] (do_mount) from [<8026e2a4>] (ksys_mount+0x8c/0xb4)
[<8026e2a4>] (ksys_mount) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x8550dfa8 to 0x8550dff0)
dfa0: 00000000 00000000 7e9d8f85 7e9d8f8a 7e9d8f91 00008000
dfc0: 00000000 00000000 7e9d8f85 00000015 7e9d8f91 00008000 00000000 00000000
dfe0: 00000000 7e9d8b68 0007e5a7 00012fba
/ # cd mnt
sys_stat64()
|-> vfs_statx()
|-> filename_lookup()
|-> walk_component()
|-> lookup_slow()
|-> __lookup_slow()
|-> bvfs_lookup()
bvfs_lookup(#218):
CPU: 0 PID: 800 Comm: sh Tainted: G O 4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000224>] (bvfs_lookup+0x40/0x5c [bvfs])
[<7f000224>] (bvfs_lookup [bvfs]) from [<802551a0>] (__lookup_slow+0x8c/0x154)
[<802551a0>] (__lookup_slow) from [<80255298>] (lookup_slow+0x30/0x44)
[<80255298>] (lookup_slow) from [<802579c8>] (walk_component+0x1c8/0x300)
[<802579c8>] (walk_component) from [<802580c8>] (path_lookupat+0x70/0x1fc)
[<802580c8>] (path_lookupat) from [<80259da8>] (filename_lookup+0x9c/0x10c)
[<80259da8>] (filename_lookup) from [<8024f3d0>] (vfs_statx+0x68/0xd0)
[<8024f3d0>] (vfs_statx) from [<8024fbc8>] (sys_stat64+0x38/0x68)
[<8024fbc8>] (sys_stat64) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x85505fa8 to 0x85505ff0)
5fa0: 0018a058 00000000 0018a25c 7e97caf8 7e97caf8 0018a266
5fc0: 0018a058 00000000 00000001 000000c3 001864dc 00186478 00000000 7e97cadc
5fe0: 000000c3 7e97cac4 000ddb9b 00011726
基本上dentry就是一個representation/cache結構,用於反映file-system的目錄結構,而不須真的從file-system讀出資料。
A "dentry" in the Linux kernel is the in-memory representation of a directory entry; it is a way of remembering the resolution of a given file or directory name without having to search through the filesystem to find it. The dentry cache speeds lookups considerably; keeping dentries for frequently accessed names like /tmp, /dev/null, or /usr/bin/tetris saves a lot of filesystem I/O.
概略的一些關係圖如下,希望能讓大家對這些關係有較清楚的概念。
目前為止的功能只能達到mount,其餘的下一個章節繼續囉。
-
參考資料:
- Overview of the Linux Virtual File System
- Creating Linux virtual filesystems
- Creating Linux virtual filesystems
- Kernel index
- The Linux Virtual File System
- mount过程分析之一(基于3.16.3内核)【转】
- Linux VFS机制简析(一)
- Linux Filesystems API
- Linux Filesystems in 45 minutes
- VFS文件系统结构分析
- VFS file system structure analysis
- Linux Filesystems API
- Writing a Simple File System
- VFS中的file,dentry和inode



https://lwn.net/Articles/814535/
A negative dentry is a little different, though: it is a memory of a filesystem lookup that failed. If a user types "more cowbell" and no file named cowbell exists, the kernel will create a negative dentry recording that fact.