2020年11月14日 星期六

Linux Kernel(18.1)- My First Filesystem


這一個章節要跟大家介紹如何寫一個filesystem,主要是讓大家對VFS的framework有一些基本認識。
首先要透過register_filesystem()註冊對應的filesystem,而register_filesystem()需要struct file_system_type的一些基本參數如下:
static struct file_system_type bv_fs_type = {
    // for internal VFS use: you should initialize this to THIS_MODULE in most cases
    .owner = THIS_MODULE,
    // the name of the filesystem type, such as "ext2", "iso9660"
    .name = "bvfs",
    // the method to call when a new instance of this filesystem should be mounted
    .mount = bvfs_mount,
    // the method to call when an instance of this filesystem should be shut down
    .kill_sb = bvfs_kill_sb,
};

MODULE_ALIAS_FS("bv");

static int __init init_bv_fs(void)
{
    printk("%s(#%d)\n", __func__, __LINE__);
    return register_filesystem(&bv_fs_type);
}
module_init(init_bv_fs);
name是filesystem type的name,mount是mount的時候會被呼叫的method,mount() method必須回傳root dentry,kill_sb則是umount會被呼叫的method。
通常mount() method會調用generic mount() implementations,包含mount_bdev、mount_nodev、mount_single並帶入fill_super() callback用於初始化struct super_block *與創建root dentry。關係圖概略如下:

在bvfs_fill_super()中主要要填入super block operations,並透過bvfs_get_inode()取得mount point/root的inode,再透過d_make_root()取得root dentry。
struct inode *bvfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev)
{
    // Allocates a new inode for given superblock.
    // The default gfp_mask for allocations related
    // to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
    // If HIGHMEM pages are unsuitable or it is known
    // that pages allocated for the page cache are
    // not reclaimable or migratable, mapping_set_gfp_mask
    // must be called with suitable flags on
    // the newly created inode's mapping
    struct inode *inode = new_inode(sb);
    pr_debug("%s(#%d): mode:0%o\n", __func__, __LINE__, mode);
    if (inode) {
        inode->i_ino = get_next_ino();
        pr_debug("%s(#%d): i_ino:%lx\n", __func__, __LINE__, inode->i_ino);

        // Init uid,gid,mode for new inode according to posix standards
        inode_init_owner(inode, dir, mode);

        // add file operation
        inode->i_mapping->a_ops = &bvfs_aops;
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

        switch (mode & S_IFMT) {
            default:
                init_special_inode(inode, mode, dev);
                break;

            case S_IFREG:
                inode->i_op = &bvfs_file_inode_operations;
                inode->i_fop = &bvfs_file_operations;
                break;

            case S_IFDIR:
                inode->i_op = &bvfs_dir_inode_ops;
                inode->i_fop = &simple_dir_operations;

                /* directory inodes start off with i_nlink == 2 (for "." entry) */
                inc_nlink(inode);
                break;
        }
    }
    return inode;
}

#define BVFS_MAGIC 0x20201204

/**
 * create and populate the root directory for our new filesystem.
 */
static int bvfs_fill_super(struct super_block *sb, void *data, int silent)
{
    struct bvfs_fs_info *fsi;
    struct inode *inode;

    pr_debug("%s(#%d): data:%s, silent:%d\n", __func__, __LINE__, (char *) data, silent);

    fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
    sb->s_fs_info = fsi;
    if (!fsi) {
        pr_err("%s(#%d): kzalloc()\n", __func__, __LINE__);
        return -ENOMEM;
    }
    // set default mount permission to 755
    fsi->mount_opts.mode = S_IRWXU | (S_IRGRP | S_IXGRP) | (S_IROTH | S_IXOTH);

    // The super block operations are set at the time of mounting.
    sb->s_maxbytes = MAX_LFS_FILESIZE;
    // The maximum file system block size is limited by the page cache size
    // (which is 4k on x86 systems).
    sb->s_blocksize = PAGE_SIZE;
    // The number of bits that make up the filesystem block size
    sb->s_blocksize_bits = PAGE_SHIFT;
    sb->s_magic = BVFS_MAGIC;
    sb->s_op = &bvfs_ops;
    sb->s_time_gran = 1;

    // passed S_IFDIR, the returned inode will describe a directory
    inode = bvfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
    if (!inode) {
        pr_err("%s(#%d): bvfs_get_inode failed\n", __func__, __LINE__);
        return -ENOMEM;
    }
    // This directory inode must be put into the directory cache
    // (by way of a "dentry" structure) so that the VFS can find it
    sb->s_root = d_make_root(inode);
    if (!sb->s_root) {
        pr_err("%s(#%d): d_make_root()\n", __func__, __LINE__);
        return -ENOMEM;
    }

    dump_stack();
    return 0;
}

/*
 *  @param fs_type describes the filesystem, partly initialized by the specific filesystem code
 *  @param flags mount flags
 *  @param dev_name the device name we are mounting
 *  @param data arbitrary mount options, usually comes as an ASCII string
 *
 *  @return struct dentry
 *
 */
static struct dentry *bvfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
{
    pr_debug("%s(#%d): flags:%d, dev_name:%s, data:%s\n", __func__, __LINE__, flags, dev_name, (char *) data);
    /*
     * Usually, a filesystem uses one of the generic mount() implementations
     * and provides a fill_super() callback instead. The generic variants are:
     *
     * mount_bdev: mount a filesystem residing on a block device
     *
     * mount_nodev: mount a filesystem that is not backed by a device
     *
     * mount_single: mount a filesystem which shares the instance between all mounts
     */
    return mount_nodev(fs_type, flags, data, bvfs_fill_super);
}

static void bvfs_kill_sb(struct super_block *sb)
{
    pr_debug("%s(#%d)\n", __func__, __LINE__);
    // kill_litter_super() is a generic function provided by the VFS;
    // it simply cleans up all of the in-core structures when the filesystem is unmounted
    kill_litter_super(sb);
}

透過簡單的mount可以更了解一下整個call flow
/ # sysctl kernel.printk='8 8 8 8'
kernel.printk = 8 8 8 8
/ # insmod bvfs.ko
[  181.936205] init_bv_fs(#425)

/ # mount bvfs mnt -t bvfs
sys_mount()
  |-> ksys_mount()
    |-> do_mount()
      |-> do_new_mount()
        |-> vfs_kern_mount()
          |-> mount_fs()
            |-> bvfs_mount()
              |-> mount_nodev()
                |-> bvfs_fill_super()
bvfs_mount(#392): flags:32768, dev_name:bvfs, data:(null)
bvfs_fill_super(#346): data:(null), silent:1
bvfs_get_inode(#302): mode:040755
bvfs_get_inode(#305): i_ino:1340
CPU: 0 PID: 810 Comm: mount Tainted: G           O      4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000b34>] (bvfs_fill_super+0x108/0x124 [bvfs])
[<7f000b34>] (bvfs_fill_super [bvfs]) from [<8024d8ec>] (mount_nodev+0x44/0x90)
[<8024d8ec>] (mount_nodev) from [<7f000468>] (bvfs_mount+0x68/0x78 [bvfs])
[<7f000468>] (bvfs_mount [bvfs]) from [<8024e3f0>] (mount_fs+0x14/0xa8)
[<8024e3f0>] (mount_fs) from [<8026af6c>] (vfs_kern_mount.part.3+0x48/0xf8)
[<8026af6c>] (vfs_kern_mount.part.3) from [<8026d80c>] (do_mount+0x57c/0xc80)
[<8026d80c>] (do_mount) from [<8026e2a4>] (ksys_mount+0x8c/0xb4)
[<8026e2a4>] (ksys_mount) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x8550dfa8 to 0x8550dff0)
dfa0:                   00000000 00000000 7e9d8f85 7e9d8f8a 7e9d8f91 00008000
dfc0: 00000000 00000000 7e9d8f85 00000015 7e9d8f91 00008000 00000000 00000000
dfe0: 00000000 7e9d8b68 0007e5a7 00012fba


/ # cd mnt
sys_stat64()
  |-> vfs_statx()
    |-> filename_lookup()
      |-> walk_component()
        |-> lookup_slow()
          |-> __lookup_slow()
            |-> bvfs_lookup()

bvfs_lookup(#218):
CPU: 0 PID: 800 Comm: sh Tainted: G           O      4.19.0-rc8+ #32
Hardware name: ARM-Versatile Express
[<80111eb4>] (unwind_backtrace) from [<8010d8e4>] (show_stack+0x10/0x14)
[<8010d8e4>] (show_stack) from [<806c81b0>] (dump_stack+0x88/0x9c)
[<806c81b0>] (dump_stack) from [<7f000224>] (bvfs_lookup+0x40/0x5c [bvfs])
[<7f000224>] (bvfs_lookup [bvfs]) from [<802551a0>] (__lookup_slow+0x8c/0x154)
[<802551a0>] (__lookup_slow) from [<80255298>] (lookup_slow+0x30/0x44)
[<80255298>] (lookup_slow) from [<802579c8>] (walk_component+0x1c8/0x300)
[<802579c8>] (walk_component) from [<802580c8>] (path_lookupat+0x70/0x1fc)
[<802580c8>] (path_lookupat) from [<80259da8>] (filename_lookup+0x9c/0x10c)
[<80259da8>] (filename_lookup) from [<8024f3d0>] (vfs_statx+0x68/0xd0)
[<8024f3d0>] (vfs_statx) from [<8024fbc8>] (sys_stat64+0x38/0x68)
[<8024fbc8>] (sys_stat64) from [<80101000>] (ret_fast_syscall+0x0/0x54)
Exception stack(0x85505fa8 to 0x85505ff0)
5fa0:                   0018a058 00000000 0018a25c 7e97caf8 7e97caf8 0018a266
5fc0: 0018a058 00000000 00000001 000000c3 001864dc 00186478 00000000 7e97cadc
5fe0: 000000c3 7e97cac4 000ddb9b 00011726


基本上dentry就是一個representation/cache結構,用於反應file-system的目錄結構,而不須真的讀從file-system讀出資料。
A "dentry" in the Linux kernel is the in-memory representation of a directory entry;
it is a way of remembering the resolution of a given file or directory name 
without having to search through the filesystem to find it. 
The dentry cache speeds lookups considerably; 
keeping dentries for frequently accessed names like /tmp, /dev/null, 
or /usr/bin/tetris saves a lot of filesystem I/O.


概略的一些關係圖如下,希望能讓大家能對這些關係有較清楚的概念。



目前為止的功能只能達到mount,其餘的下一個章節繼續囉。


2020年10月9日 星期五

How MSM Parse Partition Table in Kernel


call flow
msm_nand_probe()
  |-> msm_nand_parse_smem_ptable() 
    |-> smem_get_entry(SMEM_AARM_PARTITION_TABLE)
  |-> mtd_device_parse_register()


code snippet, https://android.googlesource.com/kernel/msm/+/android-msm-hammerhead-3.4-kk-r1/drivers/mtd/devices/msm_qpic_nand.c
#define FLASH_PART_MAGIC1        0x55EE73AA
#define FLASH_PART_MAGIC2        0xE35EBDDB
#define FLASH_PTABLE_V3         3
#define FLASH_PTABLE_V4         4
#define FLASH_PTABLE_MAX_PARTS_V3  16
#define FLASH_PTABLE_MAX_PARTS_V4  32
#define FLASH_PTABLE_HDR_LEN     (4*sizeof(uint32_t))

// Parititon Table Store in Flash.
struct flash_partition_entry {
  char name[FLASH_PTABLE_ENTRY_NAME_SIZE];
  u32 offset;   /* Offset in blocks from beginning of device */
  u32 length;   /* Length of the partition in blocks */
  u8 attr;     /* Flags for this partition */
};

struct flash_partition_table {
  u32 magic1;
  u32 magic2;
  u32 version;
  u32 numparts;
  struct flash_partition_entry part_entry[FLASH_PTABLE_MAX_PARTS_V4];
};

static struct mtd_partition mtd_part[FLASH_PTABLE_MAX_PARTS_V4];

static int __devinit msm_nand_probe(struct platform_device *pdev)
{
  ...
  err = msm_nand_parse_smem_ptable(&nr_parts);
  ...
  for (i = 0; i < nr_parts; i++) {
    mtd_part[i].offset *= info->mtd.erasesize;
    mtd_part[i].size *= info->mtd.erasesize;
  }
  err = mtd_device_parse_register(&info->mtd, NULL, NULL, &mtd_part[0], nr_parts);
  ...
}

// Get Partition Table from RAM/Flash via smem_get_entry(SMEM_AARM_PARTITION_TABLE).
static int msm_nand_parse_smem_ptable(int *nr_parts)
{
  uint32_t  i, j;
  uint32_t len = FLASH_PTABLE_HDR_LEN;
  struct flash_partition_entry *pentry;
  char *delimiter = ":";
  pr_info("Parsing partition table info from SMEM\n");
  /* Read only the header portion of ptable */
  ptable = *(struct flash_partition_table *)
            (smem_get_entry(SMEM_AARM_PARTITION_TABLE, &len));
  /* Verify ptable magic */
  if (ptable.magic1 != FLASH_PART_MAGIC1 || 
      ptable.magic2 != FLASH_PART_MAGIC2) {
    pr_err("Partition table magic verification failed\n");
    goto out;
  }
  /* Ensure that # of partitions is less than the max we have allocated */
  if (ptable.numparts > FLASH_PTABLE_MAX_PARTS_V4) {
    pr_err("Partition numbers exceed the max limit\n");
    goto out;
  }
  /* Find out length of partition data based on table version. */
  if (ptable.version <= FLASH_PTABLE_V3) {
    len = FLASH_PTABLE_HDR_LEN + FLASH_PTABLE_MAX_PARTS_V3 *
        sizeof(struct flash_partition_entry);
  } else if (ptable.version == FLASH_PTABLE_V4) {
    len = FLASH_PTABLE_HDR_LEN + FLASH_PTABLE_MAX_PARTS_V4 *
        sizeof(struct flash_partition_entry);
  } else {
    pr_err("Unknown ptable version (%d)", ptable.version);
    goto out;
  }
  *nr_parts = ptable.numparts;
  ptable = *(struct flash_partition_table *)
        (smem_get_entry(SMEM_AARM_PARTITION_TABLE, &len));
  for (i = 0; i < ptable.numparts; i++) {
    pentry = &ptable.part_entry[i];
    if (pentry->name == '\0')
      continue;
    /* Convert name to lower case and discard the initial chars */
    mtd_part[i].name = pentry->name;
    for (j = 0; j < strlen(mtd_part[i].name); j++)
      *(mtd_part[i].name + j) = tolower(*(mtd_part[i].name + j));
      strsep(&(mtd_part[i].name), delimiter);
      mtd_part[i].offset      = pentry->offset;
      mtd_part[i].mask_flags  = pentry->attr;
      mtd_part[i].size        = pentry->length;
      pr_debug("%d: %s offs=0x%08x size=0x%08x attr:0x%08x\n",
          i, pentry->name, pentry->offset, pentry->length, pentry->attr);
  }
  pr_info("SMEM partition table found: ver: %d len: %d\n",
      ptable.version, ptable.numparts);
  return 0;
out:
  return -EINVAL;
}

static const struct of_device_id msm_nand_match_table[] = {
  { .compatible = "qcom,msm-nand", },
  {},
};


2020年9月27日 星期日

Linux Kernel(10.3.1)- Command line partition table parsing for Kernel 4.19


由於10.3的kernel過時,所以10.3.1是用kernel 4.19來做補充。
MTD Partition除了在code中寫死以外,其實還可以透過一些parsers來作規劃,這一章就要來教大家如何使用"Command line partition table parsing"。
首先必須在kernel中啟用"Command line partition table parsing",請參照10.3。
接著在command line中加入mtdparts的設定,其簡單說明如下:
 * mtdparts=<mtddef>[;<mtddef]
 * <mtddef>  := <mtd-id>:<partdef>[,<partdef>]
 * <partdef> := <size>[@<offset>][<name>][ro][lk]
 * <mtd-id>  := unique name used in mapping driver/device (mtd->name)
 * <size>    := standard linux memsize OR "-" to denote all remaining space
 *              size is automatically truncated at end of device
 *              if specified or truncated size is 0 the part is skipped
 * <offset>  := standard linux memsize
 *              if omitted the part will immediately follow the previous part
 *              or 0 if the first part
 * <name>    := '(' NAME ')'
 *              NAME will appear in /proc/mtd
 *
 * <size> and <offset> can be specified such that the parts are out of order
 * in physical memory and may even overlap.
 *
 * The parts are assigned MTD numbers in the order they are specified in the
 * command line regardless of their order in physical memory.

 * Due to the way Linux handles the command line, no spaces are
 * allowed in the partition definition, including mtd id's and partition
 * names.

這裡我用qemu與mtdram來執行,執行參數如下
qemu-system-arm -s -M vexpress-a9 -m 128M -kernel ./linux/arch/arm/boot/zImage \
                -dtb ./linux/arch/arm/boot/dts/vexpress-v2p-ca9.dtb -initrd ./initrd-arm.img \
                -nographic -append "console=ttyAMA0 mtdparts=\"mtdram test device:1024k(p1),1024k(p2),-(p3)\""

這裡特別要說明的是mtd-id就是mtd name,基本上你可以以cat /proc/mtd,就可以知道mtd name了,或者trace一下code也行
// file "drivers/mtd/cmdlinepart.c"
static int parse_cmdline_partitions(struct mtd_info *master,
                    const struct mtd_partition **pparts,
                    struct mtd_part_parser_data *data)
{
    ...
const char *mtd_id = master->name;
    /*
     * Search for the partition definition matching master->name.
     * If master->name is not set, stop at first partition definition.
     */
    for (part = partitions; part; part = part->next) {
        if ((!mtd_id) || (!strcmp(part->mtd_id, mtd_id)))
            break;
    }
    ...
}

// drivers/mtd/devices/mtdram.c
static int __init init_mtdram(void)
{
    ...
    err = mtdram_init_device(mtd_info, addr, MTDRAM_TOTAL_SIZE, "mtdram test device");
    ...
}

int mtdram_init_device(struct mtd_info *mtd, void *mapped_address,
        unsigned long size, const char *name)
{
    ...
    /* Setup the MTD structure */
    mtd->name = name;
    ...
}

此外我多印了一些deubg資訊在drivers/mtd/cmdlinepart.c
diff --git a/drivers/mtd/cmdlinepart.c b/drivers/mtd/cmdlinepart.c
index 3ea44cff9b75..a411ce85097e 100644
--- a/drivers/mtd/cmdlinepart.c
+++ b/drivers/mtd/cmdlinepart.c
@@ -48,6 +48,7 @@
  * edb7312-nor:256k(ARMboot)ro,-(root);edb7312-nand:-(home)
  */

+#define DEBGU
 #define pr_fmt(fmt)    "mtd: " fmt

 #include 
@@ -58,7 +59,7 @@
 #include 

 /* debug macro */
-#if 0
+#if 1
 #define dbg(x) do { printk("DEBUG-CMDLINE-PART: "); printk x; } while(0)
 #else
 #define dbg(x)
@@ -315,6 +316,7 @@ static int parse_cmdline_partitions(struct mtd_info *master,
        struct cmdline_mtd_partition *part;
        const char *mtd_id = master->name;

+       printk("BROOK, %s(#%d), cmdline_parsed:%d\n", __func__, __LINE__, cmdline_parsed);
        /* parse command line */
        if (!cmdline_parsed) {
                err = mtdpart_setup_real(cmdline);
@@ -327,12 +329,15 @@ static int parse_cmdline_partitions(struct mtd_info *master,
         * If master->name is not set, stop at first partition definition.
         */
        for (part = partitions; part; part = part->next) {
+           printk("BROOK, %s(#%d), mtd_id:%s, part->mtd_id:%s\n", __func__, __LINE__, mtd_id, part->mtd_id);
                if ((!mtd_id) || (!strcmp(part->mtd_id, mtd_id)))
                        break;
        }

-       if (!part)
+       printk("BROOK, %s(#%d), part:%p\n", __func__, __LINE__, part);
+       if (!part) {
                return 0;
+       }

        for (i = 0, offset = 0; i < part->num_parts; i++) {
                if (part->parts[i].offset == OFFSET_CONTINUOUS)

執行結果如下:
/ # dmesg
...
[    4.688086] BROOK, parse_cmdline_partitions(#319), cmdline_parsed:0
[    4.688876] DEBUG-CMDLINE-PART:
[    4.688994] parsing <1024k(p1),1024k(p2),-(p3)>
[    4.690787] DEBUG-CMDLINE-PART:
[    4.690944] partition 2: name <p3>, offset ffffffffffffffff, size ffffffffffffffff, mask flags 0
[    4.692017] DEBUG-CMDLINE-PART:
[    4.692102] partition 1: name <p2>, offset ffffffffffffffff, size 100000, mask flags 0
[    4.692950] DEBUG-CMDLINE-PART:
[    4.693034] partition 0: name <p1>, offset ffffffffffffffff, size 100000, mask flags 0
[    4.693993] DEBUG-CMDLINE-PART:
[    4.694101] mtdid=<mtdram test device> num_parts=<3>
[    4.695000] BROOK, parse_cmdline_partitions(#332), mtd_id:40000000.flash, part->mtd_id:mtdram test device
[    4.696546] BROOK, parse_cmdline_partitions(#337), part:  (null)
[    4.763129] BROOK, parse_cmdline_partitions(#319), cmdline_parsed:1
[    4.763742] BROOK, parse_cmdline_partitions(#332), mtd_id:48000000.psram, part->mtd_id:mtdram test device
[    4.764461] BROOK, parse_cmdline_partitions(#337), part:  (null)
[    4.806551] BROOK, parse_cmdline_partitions(#319), cmdline_parsed:1
[    4.808150] BROOK, parse_cmdline_partitions(#332), mtd_id:mtdram test device, part->mtd_id:mtdram test device
[    4.809227] BROOK, parse_cmdline_partitions(#337), part:(ptrval)
[    4.810325] 3 cmdlinepart partitions found on MTD device mtdram test device
[    4.811006] Creating 3 MTD partitions on "mtdram test device":
[    4.813103] 0x000000000000-0x000000100000 : "p1"
[    4.830841] 0x000000100000-0x000000200000 : "p2"
[    4.846502] 0x000000200000-0x000000400000 : "p3"
...

/ # cat /proc/mtd
dev:    size   erasesize  name
mtd0: 08000000 00040000 "40000000.flash"
mtd1: 02000000 00001000 "48000000.psram"
mtd2: 00100000 00020000 "p1"
mtd3: 00100000 00020000 "p2"
mtd4: 00200000 00020000 "p3"

/ # zcat /proc/config.gz | grep -i MTDRAM
CONFIG_MTD_MTDRAM=y
CONFIG_MTDRAM_TOTAL_SIZE=4096
CONFIG_MTDRAM_ERASE_SIZE=128




熱門文章