這一個章節要跟大家介紹如何寫一個filesystem,主要是讓大家對VFS的framework有一些基本認識。
首先要透過register_filesystem()註冊對應的filesystem,而register_filesystem()需要struct file_system_type的一些基本參數如下:
static struct file_system_type bv_fs_type = { // for internal VFS use: you should initialize this to THIS_MODULE in most cases .owner = THIS_MODULE, // the name of the filesystem type, such as "ext2", "iso9660" .name = "bvfs", // the method to call when a new instance of this filesystem should be mounted .mount = bvfs_mount, // the method to call when an instance of this filesystem should be shut down .kill_sb = bvfs_kill_sb, }; MODULE_ALIAS_FS("bv"); static int __init init_bv_fs(void) { printk("%s(#%d)\n", __func__, __LINE__); return register_filesystem(&bv_fs_type); } module_init(init_bv_fs);name是filesystem type的name,mount是mount的時候會被呼叫的method,mount() method必須回傳root dentry,kill_sb則是umount會被呼叫的method。
通常mount() method會調用generic mount() implementations,包含mount_bdev、mount_nodev、mount_single並帶入fill_super() callback用於初始化struct super_block *與創建root dentry。關係圖概略如下:
在bvfs_fill_super()中主要要填入super block operations,並透過bvfs_get_inode()取得mount point/root的inode,再透過d_make_root()取得root dentry。
struct inode *bvfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { // Allocates a new inode for given superblock. // The default gfp_mask for allocations related // to inode->i_mapping is GFP_HIGHUSER_MOVABLE. // If HIGHMEM pages are unsuitable or it is known // that pages allocated for the page cache are // not reclaimable or migratable, mapping_set_gfp_mask // must be called with suitable flags on // the newly created inode's mapping struct inode *inode = new_inode(sb); pr_debug("%s(#%d): mode:0%o\n", __func__, __LINE__, mode); if (inode) { inode->i_ino = get_next_ino(); pr_debug("%s(#%d): i_ino:%lx\n", __func__, __LINE__, inode->i_ino); // Init uid,gid,mode for new inode according to posix standards inode_init_owner(inode, dir, mode); // add file operation inode->i_mapping->a_ops = &bvfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &bvfs_file_inode_operations; inode->i_fop = &bvfs_file_operations; break; case S_IFDIR: inode->i_op = &bvfs_dir_inode_ops; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; } } return inode; } #define BVFS_MAGIC 0x20201204 /** * create and populate the root directory for our new filesystem. */ static int bvfs_fill_super(struct super_block *sb, void *data, int silent) { struct bvfs_fs_info *fsi; struct inode *inode; pr_debug("%s(#%d): data:%s, silent:%d\n", __func__, __LINE__, (char *) data, silent); fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); sb->s_fs_info = fsi; if (!fsi) { pr_err("%s(#%d): kzalloc()\n", __func__, __LINE__); return -ENOMEM; } // set default mount permission to 755 fsi->mount_opts.mode = S_IRWXU | (S_IRGRP | S_IXGRP) | (S_IROTH | S_IXOTH); // The super block operations are set at the time of mounting. 
sb->s_maxbytes = MAX_LFS_FILESIZE; // The maximum file system block size is limited by the page cache size // (which is 4k on x86 systems). sb->s_blocksize = PAGE_SIZE; // The number of bits that make up the filesystem block size sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = BVFS_MAGIC; sb->s_op = &bvfs_ops; sb->s_time_gran = 1; // passed S_IFDIR, the returned inode will describe a directory inode = bvfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { pr_err("%s(#%d): bvfs_get_inode failed\n", __func__, __LINE__); return -ENOMEM; } // This directory inode must be put into the directory cache // (by way of a "dentry" structure) so that the VFS can find it sb->s_root = d_make_root(inode); if (!sb->s_root) { pr_err("%s(#%d): d_make_root()\n", __func__, __LINE__); return -ENOMEM; } dump_stack(); return 0; } /* * @param fs_type describes the filesystem, partly initialized by the specific filesystem code * @param flags mount flags * @param dev_name the device name we are mounting * @param data arbitrary mount options, usually comes as an ASCII string * * @return struct dentry * */ static struct dentry *bvfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { pr_debug("%s(#%d): flags:%d, dev_name:%s, data:%s\n", __func__, __LINE__, flags, dev_name, (char *) data); /* * Usually, a filesystem uses one of the generic mount() implementations * and provides a fill_super() callback instead. 
The generic variants are: * * mount_bdev: mount a filesystem residing on a block device * * mount_nodev: mount a filesystem that is not backed by a device * * mount_single: mount a filesystem which shares the instance between all mounts */ return mount_nodev(fs_type, flags, data, bvfs_fill_super); } static void bvfs_kill_sb(struct super_block *sb) { pr_debug("%s(#%d)\n", __func__, __LINE__); // kill_litter_super() is a generic function provided by the VFS; // it simply cleans up all of the in-core structures when the filesystem is unmounted kill_litter_super(sb); }
透過簡單的mount可以更了解一下整個call flow
/ # sysctl kernel.printk='8 8 8 8' kernel.printk = 8 8 8 8 / # insmod bvfs.ko [ 181.936205] init_bv_fs(#425) / # mount bvfs mnt -t bvfs sys_mount() |-> ksys_mount() |-> do_mount() |-> do_new_mount() |-> vfs_kern_mount() |-> mount_fs() |-> bvfs_mount() |-> mount_nodev() |-> bvfs_fill_super() bvfs_mount(#392): flags:32768, dev_name:bvfs, data:(null) bvfs_fill_super(#346): data:(null), silent:1 bvfs_get_inode(#302): mode:040755 bvfs_get_inode(#305): i_ino:1340 CPU: 0 PID: 810 Comm: mount Tainted: G O 4.19.0-rc8+ #32 Hardware name: ARM-Versatile Express [<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14) [<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c) [<806c81b0>] (dump_stack) from [<7f000b34>] (bvfs_fill_super+0x108/0x124 [bvfs]) [<7f000b34>] (bvfs_fill_super [bvfs]) from [<8024d8ec>] (mount_nodev+0x44/0x90) [<8024d8ec>] (mount_nodev) from [<7f000468>] (bvfs_mount+0x68/0x78 [bvfs]) [<7f000468>] (bvfs_mount [bvfs]) from [<8024e3f0>] (mount_fs+0x14/0xa8) [<8024e3f0>] (mount_fs) from [<8026af6c>] (vfs_kern_mount.part.3+0x48/0xf8) [<8026af6c>] (vfs_kern_mount.part.3) from [<8026d80c>] (do_mount+0x57c/0xc80) [<8026d80c>] (do_mount) from [<8026e2a4>] (ksys_mount+0x8c/0xb4) [<8026e2a4>] (ksys_mount) from [<80101000>] (ret_fast_syscall+0x0/0x54) Exception stack(0x8550dfa8 to 0x8550dff0) dfa0: 00000000 00000000 7e9d8f85 7e9d8f8a 7e9d8f91 00008000 dfc0: 00000000 00000000 7e9d8f85 00000015 7e9d8f91 00008000 00000000 00000000 dfe0: 00000000 7e9d8b68 0007e5a7 00012fba / # cd mnt sys_stat64() |-> vfs_statx() |-> filename_lookup() |-> walk_component() |-> lookup_slow() |-> __lookup_slow() |-> bvfs_lookup() bvfs_lookup(#218): CPU: 0 PID: 800 Comm: sh Tainted: G O 4.19.0-rc8+ #32 Hardware name: ARM-Versatile Express [<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14) [<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c) [<806c81b0>] (dump_stack) from [<7f000224>] (bvfs_lookup+0x40/0x5c [bvfs]) [<7f000224>] 
(bvfs_lookup [bvfs]) from [<802551a0>] (__lookup_slow+0x8c/0x154) [<802551a0>] (__lookup_slow) from [<80255298>] (lookup_slow+0x30/0x44) [<80255298>] (lookup_slow) from [<802579c8>] (walk_component+0x1c8/0x300) [<802579c8>] (walk_component) from [<802580c8>] (path_lookupat+0x70/0x1fc) [<802580c8>] (path_lookupat) from [<80259da8>] (filename_lookup+0x9c/0x10c) [<80259da8>] (filename_lookup) from [<8024f3d0>] (vfs_statx+0x68/0xd0) [<8024f3d0>] (vfs_statx) from [<8024fbc8>] (sys_stat64+0x38/0x68) [<8024fbc8>] (sys_stat64) from [<80101000>] (ret_fast_syscall+0x0/0x54) Exception stack(0x85505fa8 to 0x85505ff0) 5fa0: 0018a058 00000000 0018a25c 7e97caf8 7e97caf8 0018a266 5fc0: 0018a058 00000000 00000001 000000c3 001864dc 00186478 00000000 7e97cadc 5fe0: 000000c3 7e97cac4 000ddb9b 00011726
基本上dentry就是一個representation/cache結構,用於反映file-system的目錄結構,而不須真的從file-system讀出資料。
A "dentry" in the Linux kernel is the in-memory representation of a directory entry; it is a way of remembering the resolution of a given file or directory name without having to search through the filesystem to find it. The dentry cache speeds lookups considerably; keeping dentries for frequently accessed names like /tmp, /dev/null, or /usr/bin/tetris saves a lot of filesystem I/O.
概略的關係圖如下,希望能讓大家對這些關係有較清楚的概念。
目前為止的功能只能達到mount,其餘的下一個章節繼續囉。
-
參考資料:
- Overview of the Linux Virtual File System
- Creating Linux virtual filesystems
- Creating Linux virtual filesystems
- Kernel index
- The Linux Virtual File System
- mount过程分析之一(基于3.16.3内核)【转】
- Linux VFS机制简析(一)
- Linux Filesystems API
- Linux Filesystems in 45 minutes
- VFS文件系统结构分析
- VFS file system structure analysis
- Linux Filesystems API
- Writing a Simple File System
- VFS中的file,dentry和inode