2020年11月14日 星期六

Linux Kernel(18.1)- My First Filesystem


這一個章節要跟大家介紹如何寫一個filesystem,主要是讓大家對VFS的framework有一些基本認識。
首先要透過register_filesystem()註冊對應的filesystem,而register_filesystem()需要struct file_system_type的一些基本參數如下:
/*
 * Filesystem type descriptor handed to register_filesystem().
 * It names the filesystem and supplies the mount/unmount entry points.
 */
static struct file_system_type bv_fs_type = {
    /* filesystem type name, as given to "mount -t" (compare "ext2", "iso9660") */
    .name = "bvfs",
    /* used internally by the VFS; normally set to THIS_MODULE */
    .owner = THIS_MODULE,
    /* called when a new instance of this filesystem is to be mounted */
    .mount = bvfs_mount,
    /* called when an instance of this filesystem is to be shut down */
    .kill_sb = bvfs_kill_sb,
};

/*
 * Module alias used for on-demand loading of the filesystem module.
 * MODULE_ALIAS_FS() produces the alias "fs-<name>", and the VFS requests
 * "fs-<type>" when an unregistered type is mounted, so the string here has
 * to match bv_fs_type.name ("bvfs"); "bv" would never be matched.
 */
MODULE_ALIAS_FS("bvfs");

/*
 * Module entry point: log the call site, then register the bvfs
 * filesystem type with the VFS.
 *
 * Returns the result of register_filesystem().
 */
static int __init init_bv_fs(void)
{
    int err;

    printk("%s(#%d)\n", __func__, __LINE__);
    err = register_filesystem(&bv_fs_type);

    return err;
}
module_init(init_bv_fs);
name是filesystem type的name,mount是mount的時候會被呼叫的method,mount() method必須回傳root dentry,kill_sb則是umount會被呼叫的method。
通常mount() method會調用generic mount() implementations,包含mount_bdev、mount_nodev、mount_single並帶入fill_super() callback用於初始化struct super_block *與創建root dentry。關係圖概略如下:

在bvfs_fill_super()中主要要填入super block operations,並透過bvfs_get_inode()取得mount point/root的inode,再透過d_make_root()取得root dentry。
/*
 * Allocate a new inode on superblock @sb and set it up for bvfs.
 *
 * @param sb   superblock the inode belongs to
 * @param dir  parent directory inode, passed to inode_init_owner()
 *             (the root inode is created with NULL)
 * @param mode file type and permission bits
 * @param dev  device number, only used for inodes that are neither
 *             regular files nor directories
 *
 * @return the new inode, or NULL if new_inode() failed
 */
struct inode *bvfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev)
{
    // new_inode() allocates a new inode for the given superblock.
    // The default gfp_mask for allocations related to inode->i_mapping
    // is GFP_HIGHUSER_MOVABLE; if HIGHMEM pages are unsuitable, or the
    // page-cache pages are known not to be reclaimable or migratable,
    // mapping_set_gfp_mask must be called with suitable flags on the
    // newly created inode's mapping.
    struct inode *node = new_inode(sb);
    umode_t type = mode & S_IFMT;

    pr_debug("%s(#%d): mode:0%o\n", __func__, __LINE__, mode);
    if (!node)
        return NULL;

    node->i_ino = get_next_ino();
    pr_debug("%s(#%d): i_ino:%lx\n", __func__, __LINE__, node->i_ino);

    // set uid, gid and mode of the new inode according to POSIX rules
    inode_init_owner(node, dir, mode);

    // hook up the address-space operations and stamp all three times
    node->i_mapping->a_ops = &bvfs_aops;
    node->i_ctime = current_time(node);
    node->i_mtime = node->i_ctime;
    node->i_atime = node->i_ctime;

    if (type == S_IFREG) {
        node->i_op = &bvfs_file_inode_operations;
        node->i_fop = &bvfs_file_operations;
    } else if (type == S_IFDIR) {
        node->i_op = &bvfs_dir_inode_ops;
        node->i_fop = &simple_dir_operations;

        /* directory inodes start off with i_nlink == 2 (for "." entry) */
        inc_nlink(node);
    } else {
        // every other file type is handled as a special inode
        init_special_inode(node, mode, dev);
    }

    return node;
}

/* Magic number stored in sb->s_magic by bvfs_fill_super(). */
#define BVFS_MAGIC 0x20201204

/**
 * Initialise the superblock of a new bvfs instance and create its root
 * directory.
 *
 * @param sb     superblock to fill in
 * @param data   mount data; only logged by this function
 * @param silent only logged by this function
 *
 * @return 0 on success, -ENOMEM when kzalloc(), bvfs_get_inode() or
 *         d_make_root() fails
 */
static int bvfs_fill_super(struct super_block *sb, void *data, int silent)
{
    struct bvfs_fs_info *info;
    struct inode *root;

    pr_debug("%s(#%d): data:%s, silent:%d\n", __func__, __LINE__, (char *) data, silent);

    info = kzalloc(sizeof(*info), GFP_KERNEL);
    sb->s_fs_info = info;
    if (info == NULL) {
        pr_err("%s(#%d): kzalloc()\n", __func__, __LINE__);
        return -ENOMEM;
    }

    // default mount permission: rwxr-xr-x (755)
    info->mount_opts.mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;

    // superblock parameters and operations are set at mount time
    sb->s_op = &bvfs_ops;
    sb->s_magic = BVFS_MAGIC;
    sb->s_time_gran = 1;
    sb->s_maxbytes = MAX_LFS_FILESIZE;
    // the filesystem block size is limited by the page cache size
    // (4k on x86 systems), so use one page per block ...
    sb->s_blocksize = PAGE_SIZE;
    // ... and the matching number of bits for that block size
    sb->s_blocksize_bits = PAGE_SHIFT;

    // S_IFDIR makes the returned inode describe a directory
    root = bvfs_get_inode(sb, NULL, S_IFDIR | info->mount_opts.mode, 0);
    if (root == NULL) {
        pr_err("%s(#%d): bvfs_get_inode failed\n", __func__, __LINE__);
        return -ENOMEM;
    }

    // the root inode has to be placed in the directory cache
    // (through a dentry) so that the VFS can find it
    sb->s_root = d_make_root(root);
    if (sb->s_root == NULL) {
        pr_err("%s(#%d): d_make_root()\n", __func__, __LINE__);
        return -ENOMEM;
    }

    // print the call chain that led here (tutorial/debug aid)
    dump_stack();
    return 0;
}

/*
 *  Mount entry point of bvfs (file_system_type.mount).
 *
 *  @param fs_type filesystem type being mounted
 *  @param flags mount flags
 *  @param dev_name name of the device being mounted (only logged here)
 *  @param data arbitrary mount options, usually an ASCII string
 *
 *  @return the value returned by mount_nodev()
 *
 */
static struct dentry *bvfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
{
    struct dentry *root;

    pr_debug("%s(#%d): flags:%d, dev_name:%s, data:%s\n", __func__, __LINE__, flags, dev_name, (char *) data);

    /*
     * Filesystems normally delegate to one of the generic mount()
     * helpers and only supply a fill_super() callback:
     *
     * mount_bdev:   filesystem residing on a block device
     * mount_nodev:  filesystem that is not backed by a device
     * mount_single: filesystem whose instance is shared between all mounts
     *
     * bvfs uses mount_nodev().
     */
    root = mount_nodev(fs_type, flags, data, bvfs_fill_super);

    return root;
}

static void bvfs_kill_sb(struct super_block *sb)
{
    pr_debug("%s(#%d)\n", __func__, __LINE__);
    // kill_litter_super() is a generic function provided by the VFS;
    // it simply cleans up all of the in-core structures when the filesystem is unmounted
    kill_litter_super(sb);
}

透過簡單的mount可以更了解一下整個call flow
/ # sysctl kernel.printk='8 8 8 8'
kernel.printk = 8 8 8 8
/ # insmod bvfs.ko
[  181.936205] init_bv_fs(#425)

/ # mount bvfs mnt -t bvfs
sys_mount()
  |-> ksys_mount()
    |-> do_mount()
      |-> do_new_mount()
        |-> vfs_kern_mount()
          |-> mount_fs()
            |-> bvfs_mount()
              |-> mount_nodev()
                |-> bvfs_fill_super()
bvfs_mount(#392): flags:32768, dev_name:bvfs, data:(null)
bvfs_fill_super(#346): data:(null), silent:1
bvfs_get_inode(#302): mode:040755
bvfs_get_inode(#305): i_ino:1340
CPU: 0 PID: 810 Comm: mount Tainted: G           O      4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000b34>] (bvfs_fill_super+0x108/0x124 [bvfs])
[<7f000b34>] (bvfs_fill_super [bvfs]) from [<8024d8ec>] (mount_nodev+0x44/0x90)
[<8024d8ec>] (mount_nodev) from [<7f000468>] (bvfs_mount+0x68/0x78 [bvfs])
[<7f000468>] (bvfs_mount [bvfs]) from [<8024e3f0>] (mount_fs+0x14/0xa8)
[<8024e3f0>] (mount_fs) from [<8026af6c>] (vfs_kern_mount.part.3+0x48/0xf8)
[<8026af6c>] (vfs_kern_mount.part.3) from [<8026d80c>] (do_mount+0x57c/0xc80)
[<8026d80c>] (do_mount) from [<8026e2a4>] (ksys_mount+0x8c/0xb4)
[<8026e2a4>] (ksys_mount) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x8550dfa8 to 0x8550dff0)
dfa0:                   00000000 00000000 7e9d8f85 7e9d8f8a 7e9d8f91 00008000
dfc0: 00000000 00000000 7e9d8f85 00000015 7e9d8f91 00008000 00000000 00000000
dfe0: 00000000 7e9d8b68 0007e5a7 00012fba


/ # cd mnt
sys_stat64()
  |-> vfs_statx()
    |-> filename_lookup()
      |-> walk_component()
        |-> lookup_slow()
          |-> __lookup_slow()
            |-> bvfs_lookup()

bvfs_lookup(#218):
CPU: 0 PID: 800 Comm: sh Tainted: G           O      4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000224>] (bvfs_lookup+0x40/0x5c [bvfs])
[<7f000224>] (bvfs_lookup [bvfs]) from [<802551a0>] (__lookup_slow+0x8c/0x154)
[<802551a0>] (__lookup_slow) from [<80255298>] (lookup_slow+0x30/0x44)
[<80255298>] (lookup_slow) from [<802579c8>] (walk_component+0x1c8/0x300)
[<802579c8>] (walk_component) from [<802580c8>] (path_lookupat+0x70/0x1fc)
[<802580c8>] (path_lookupat) from [<80259da8>] (filename_lookup+0x9c/0x10c)
[<80259da8>] (filename_lookup) from [<8024f3d0>] (vfs_statx+0x68/0xd0)
[<8024f3d0>] (vfs_statx) from [<8024fbc8>] (sys_stat64+0x38/0x68)
[<8024fbc8>] (sys_stat64) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x85505fa8 to 0x85505ff0)
5fa0:                   0018a058 00000000 0018a25c 7e97caf8 7e97caf8 0018a266
5fc0: 0018a058 00000000 00000001 000000c3 001864dc 00186478 00000000 7e97cadc
5fe0: 000000c3 7e97cac4 000ddb9b 00011726


基本上dentry就是一個representation/cache結構,用於反映file-system的目錄結構,而不須真的從file-system讀出資料。
A "dentry" in the Linux kernel is the in-memory representation of a directory entry;
it is a way of remembering the resolution of a given file or directory name 
without having to search through the filesystem to find it. 
The dentry cache speeds lookups considerably; 
keeping dentries for frequently accessed names like /tmp, /dev/null, 
or /usr/bin/tetris saves a lot of filesystem I/O.


概略的一些關係圖如下,希望能讓大家對這些關係有較清楚的概念。



目前為止的功能只能達到mount,其餘的下一個章節繼續囉。


1 則留言:

  1. https://lwn.net/Articles/814535/

    A negative dentry is a little different, though: it is a memory of a filesystem lookup that failed. If a user types "more cowbell" and no file named cowbell exists, the kernel will create a negative dentry recording that fact.

    回覆刪除

熱門文章