Nano雞排: Linux

顯示具有 Linux - kernel 標籤的文章。顯示所有文章

2016年1月3日星期日

Linux Kernel（15.1）- platform_driver_register()之如何调用driver.probe()

轉自platform_driver_register()--如何match之后调用probe

int platform_driver_register(struct platform_driver *drv)
{
    drv->driver.bus = &platform_bus_type;/*关联总线*/
    /*关联driver的设备方法*/
    if (drv->probe)
        drv->driver.probe = platform_drv_probe;
    if (drv->remove)
        drv->driver.remove = platform_drv_remove;
    if (drv->shutdown)
        drv->driver.shutdown = platform_drv_shutdown;

    return driver_register(&drv->driver);/*注册驱动*/
}

/******************************************************************************/
/*
 * A platform driver: platform-specific callbacks, the generic
 * struct device_driver embedded in it, and an optional id table that
 * platform_match() consults before falling back to a name comparison.
 */
struct platform_driver {
    int (*probe)(struct platform_device *);/* called once a device has been matched; the kernel code below shows how it gets invoked */
    int (*remove)(struct platform_device *);
    void (*shutdown)(struct platform_device *);
    int (*suspend)(struct platform_device *, pm_message_t state);
    int (*resume)(struct platform_device *);
    struct device_driver driver;
    const struct platform_device_id *id_table;
};

/*
 * The platform bus. Note that no .probe callback is set here, so
 * really_probe() ends up calling the driver's own probe; .match is
 * platform_match().
 */
struct bus_type platform_bus_type = {
    .name        = "platform",
    .dev_attrs    = platform_dev_attrs,
    .match        = platform_match,
    .uevent        = platform_uevent,
    .pm        = &platform_dev_pm_ops,
};
/********************************************************************************/

/*
 * driver_register - register a driver with its bus
 * @drv: driver to register
 *
 * Returns -EBUSY if a driver of the same name is already registered on the
 * bus, otherwise the result of bus_add_driver() / driver_add_groups().
 */
int driver_register(struct device_driver *drv)
{
    int ret;
    struct device_driver *other;

    BUG_ON(!drv->bus->p);

    /* warn when both the bus and the driver provide the same method */
    if ((drv->bus->probe && drv->probe) ||
        (drv->bus->remove && drv->remove) ||
        (drv->bus->shutdown && drv->shutdown))
        printk(KERN_WARNING "Driver '%s' needs updating - please use "
            "bus_type methods\n", drv->name);

    /* refuse a second driver with the same name on this bus */
    other = driver_find(drv->name, drv->bus);
    if (other) {
        /* presumably drops the reference taken by driver_find() - confirm */
        put_driver(other);
        printk(KERN_ERR "Error: Driver '%s' is already registered, "
            "aborting...\n", drv->name);
        return -EBUSY;
    }

    /* add the driver to the bus; this is the path that leads to probe() */
    ret = bus_add_driver(drv);
    if (ret)
        return ret;
    ret = driver_add_groups(drv, drv->groups);
    if (ret)
        /* roll back the bus registration when the groups cannot be added */
        bus_remove_driver(drv);
    return ret;
}

/*
 * bus_add_driver - add a driver to its bus
 * @drv: driver to add
 *
 * Allocates the driver's private data, registers its kobject under the
 * bus's drivers kset and, when the bus has drivers_autoprobe set, calls
 * driver_attach() to try the driver against the devices on the bus.
 *
 * Returns 0 on success, -EINVAL if bus_get() returns NULL, -ENOMEM if the
 * private data cannot be allocated, or the error from
 * kobject_init_and_add() / driver_attach(). Failures while creating the
 * sysfs attribute and bind files are only logged; the function still
 * returns 0 in that case.
 */
int bus_add_driver(struct device_driver *drv)
{
    struct bus_type *bus;
    struct driver_private *priv;
    int error = 0;

    bus = bus_get(drv->bus);
    if (!bus)
        return -EINVAL;

    pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);
    if (!priv) {
        error = -ENOMEM;
        goto out_put_bus;
    }
    klist_init(&priv->klist_devices, NULL, NULL);
    /* link the driver and its private data in both directions */
    priv->driver = drv;
    drv->p = priv;
    priv->kobj.kset = bus->p->drivers_kset;
    error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,
                     "%s", drv->name);
    if (error)
        goto out_unregister;

    /* with autoprobe enabled, try to bind the driver right away */
    if (drv->bus->p->drivers_autoprobe) {
        error = driver_attach(drv);
        if (error)
            goto out_unregister;
    }
    klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);
    module_add_driver(drv->owner, drv);

    /* from here on errors are reported but do not fail the registration */
    error = driver_create_file(drv, &driver_attr_uevent);
    if (error) {
        printk(KERN_ERR "%s: uevent attr (%s) failed\n",
            __func__, drv->name);
    }
    error = driver_add_attrs(bus, drv);
    if (error) {
        /* How the hell do we get out of this pickle? Give up */
        printk(KERN_ERR "%s: driver_add_attrs(%s) failed\n",
            __func__, drv->name);
    }

    if (!drv->suppress_bind_attrs) {
        error = add_bind_files(drv);
        if (error) {
            /* Ditto */
            printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
                __func__, drv->name);
        }
    }

    kobject_uevent(&priv->kobj, KOBJ_ADD);
    return 0;

out_unregister:
    /*
     * NOTE(review): drv->p is the same object as priv. If the release
     * callback of driver_ktype frees priv when the last kobject reference
     * is dropped, the kfree() below is a double free - confirm against
     * the definition of driver_ktype.
     */
    kobject_put(&priv->kobj);
    kfree(drv->p);
    drv->p = NULL;
out_put_bus:
    bus_put(bus);
    return error;
}

/*
 * driver_attach - try to bind a driver to the devices on its bus
 * @drv: driver to bind
 *
 * Returns the result of bus_for_each_dev().
 */
int driver_attach(struct device_driver *drv)
{
    /* call __driver_attach() for every device on the bus */
    return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
}

/*
 * __driver_attach - per-device callback used by driver_attach()
 * @dev: device currently being visited
 * @data: the struct device_driver being attached
 *
 * Always returns 0, so a failed match or probe never stops the iteration.
 */
static int __driver_attach(struct device *dev, void *data)
{
    struct device_driver *drv = data;

    /*
     * Lock device and try to bind to it. We drop the error
     * here and always return 0, because we need to keep trying
     * to bind to devices and some drivers will return an error
     * simply if it didn't support the device.
     *
     * driver_probe_device() will spit a warning if there
     * is an error.
     */

    /* skip devices that the bus says do not match this driver */
    if (!driver_match_device(drv, dev))
        return 0;

    /* the parent (if any) is locked before the device itself */
    if (dev->parent)    /* Needed for USB */
        device_lock(dev->parent);
    device_lock(dev);
    /* only probe devices that are not bound to a driver yet */
    if (!dev->driver)
        driver_probe_device(drv, dev);
    device_unlock(dev);
    if (dev->parent)
        device_unlock(dev->parent);

    return 0;
}

/*
 * driver_match_device - ask the bus whether a driver can handle a device
 * @drv: candidate driver
 * @dev: candidate device
 *
 * Returns 1 when the bus has no match() callback, otherwise whatever the
 * bus's match() callback returns.
 */
static inline int driver_match_device(struct device_driver *drv,
                      struct device *dev)
{
    /* a bus without match() accepts every device/driver pair */
    if (!drv->bus->match)
        return 1;

    /* let the bus's match() decide whether device and driver fit */
    return drv->bus->match(dev, drv);
}

/*
 * driver_probe_device - attempt to bind a device and a driver together
 * @drv: driver to bind
 * @dev: device to bind
 *
 * Returns -ENODEV if the device is not registered, otherwise the result
 * of really_probe() (1 when bound, 0 when the probe did not succeed).
 */
int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int ret = 0;

    if (!device_is_registered(dev))
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    /* the actual probe is bracketed by the runtime-PM get/put pair */
    pm_runtime_get_noresume(dev);
    pm_runtime_barrier(dev);
    ret = really_probe(dev, drv);
    pm_runtime_put_sync(dev);

    return ret;
}
/*
 * really_probe - call the probe method for a device/driver pair
 * @dev: device to probe
 * @drv: driver to probe it with
 *
 * The bus's probe() is used when the bus provides one; otherwise the
 * driver's probe() is called.
 *
 * Returns 1 when the device was bound to the driver and 0 otherwise;
 * probe errors are logged (except -ENODEV and -ENXIO) and then dropped.
 */
static int really_probe(struct device *dev, struct device_driver *drv)
{
    int ret = 0;

    atomic_inc(&probe_count);
    pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
         drv->bus->name, __func__, drv->name, dev_name(dev));
    WARN_ON(!list_empty(&dev->devres_head));

    /* dev->driver is set before probing and cleared again on failure */
    dev->driver = drv;
    if (driver_sysfs_add(dev)) {
        printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
            __func__, dev_name(dev));
        goto probe_failed;
    }
/**********************************************************************************/
    if (dev->bus->probe) {/* first check whether the bus has a probe function and call it if so; the platform bus has none */
        ret = dev->bus->probe(dev);
        if (ret)
            goto probe_failed;
    } else if (drv->probe) {/* otherwise check whether the driver has a probe function and call it if so */
        ret = drv->probe(dev);
        if (ret)
            goto probe_failed;
    }
/************************************************************************************/

    driver_bound(dev);
    ret = 1;
    pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);
    goto done;

probe_failed:
    devres_release_all(dev);
    driver_sysfs_remove(dev);
    dev->driver = NULL;

    if (ret != -ENODEV && ret != -ENXIO) {
        /* driver matched but the probe failed */
        printk(KERN_WARNING
               "%s: probe of %s failed with error %d\n",
               drv->name, dev_name(dev), ret);
    }
    /*
     * Ignore errors returned by ->probe so that the next driver can try
     * its luck.
     */
    ret = 0;
done:
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);
    return ret;
}


/*
 * platform_match - match logic of the platform bus
 * @dev: device being matched
 * @drv: driver being matched
 *
 * A driver with an id table is matched through platform_match_id();
 * a driver without one matches when its name equals the device name.
 * Returns non-zero on a match and 0 otherwise.
 */
static int platform_match(struct device *dev, struct device_driver *drv)
{
    struct platform_driver *pdrv = to_platform_driver(drv);
    struct platform_device *pdev = to_platform_device(dev);

    /* no id table: the driver name and the device name have to be equal */
    if (!pdrv->id_table)
        return (strcmp(pdev->name, drv->name) == 0);

    /* an id table, when present, takes precedence over the name */
    return platform_match_id(pdrv->id_table, pdev) != NULL;
}

2016年1月2日星期六

Linux Kernel（15）- Platform Devices

很多人心中都有過一個問題：What is the difference between Platform driver and normal device driver? 簡單來說，Platform devices就是non-discoverable的device，也就是device本身沒辦法跟系統說"我在這裡"。典型的例子就是I2C device，它不會通知kernel"我在這裡"，通常是預先知道有個I2C device在那裡，再由software設定好，這類non-discoverable device就適用Platform devices架構來寫。
platform device會被connect在platform bus上，而platform bus是一個虛擬的bus(pseudo-bus)，這樣可以讓整個架構platform driver符合Linux的標準driver model。

這篇會根據The platform device API教導如何寫一個簡單的Platform devices，基本上最基本的platform device只需要name，因為platform bus會根據platform device與platform driver的name是否match執行driver的probe()，而最簡單的platform driver只需要name，跟probe()與remove()即可。

#include <linux/module.h>
#include <linux/platform_device.h>

MODULE_AUTHOR("Brook");
MODULE_DESCRIPTION("Kernel module for demo");
MODULE_LICENSE("GPL");

#define DEVNAME "brook"

#define DYN_ALLOC 1

/*
 * Statically allocated platform device; only the name is filled in.
 * No dev.release callback is set, which is what triggers the
 * "does not have a release() function" complaint described after this
 * listing.
 */
static struct platform_device brook_device = {
    .name = DEVNAME,
};

/* Probe callback: logs the function name and line, then reports success. */
static int brook_probe(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

/* Remove callback: logs the function name and line, then reports success. */
static int brook_remove(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

/*
 * Minimal platform driver: a name (the same DEVNAME the device uses, so
 * the name-based match succeeds) plus probe() and remove().
 */
static struct platform_driver brook_driver = {
    .driver = {
        .name  = DEVNAME,
        .owner = THIS_MODULE,
    },
    .probe  = brook_probe,
    .remove = brook_remove,
};

/*
 * Module init: register the static platform device first, then the
 * platform driver. If the driver cannot be registered, the device is
 * unregistered again before the error is returned.
 *
 * Returns 0 on success or the error code of the failing registration.
 */
static int __init brook_init(void)
{
    int ret;

    pr_info("%s(#%d)\n", __func__, __LINE__);

    ret = platform_device_register(&brook_device);
    if (ret) {
        pr_err("%s(#%d): platform_device_register failed(%d)\n",
                __func__, __LINE__, ret);
        return ret;
    }

    ret = platform_driver_register(&brook_driver);
    if (!ret)
        return ret;

    /* driver registration failed: undo the device registration */
    dev_err(&(brook_device.dev), "%s(#%d): platform_driver_register fail(%d)\n",
            __func__, __LINE__, ret);
    platform_device_unregister(&brook_device);

    return ret;
}
module_init(brook_init);

/* Module exit: log through the device, then unregister device and driver. */
static void __exit brook_exit(void)
{
    struct device *dev = &brook_device.dev;

    dev_info(dev, "%s(#%d)\n", __func__, __LINE__);

    platform_device_unregister(&brook_device);
    platform_driver_unregister(&brook_driver);
}
module_exit(brook_exit);

使用platform_device_register()會導致"brook.0" does not have a release() function, it is broken and must be fixed.的OOPS，可以改用platform_device_alloc() + platform_device_add()，platform_device_alloc()裡面就會做pa->pdev.dev.release = platform_device_release。

#include <linux/module.h>
#include <linux/platform_device.h>

MODULE_AUTHOR("Brook");
MODULE_DESCRIPTION("Kernel module for demo");
MODULE_LICENSE("GPL");

#define DEVNAME "brook"

#define DYN_ALLOC 1

/* Platform device allocated by platform_device_alloc() in brook_init(). */
static struct platform_device *brook_device;

/* Probe callback: logs the function name and line, then reports success. */
static int brook_probe(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

/* Remove callback: logs the function name and line, then reports success. */
static int brook_remove(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

/*
 * Minimal platform driver: a name (the same DEVNAME the device is
 * allocated with) plus probe() and remove().
 */
static struct platform_driver brook_driver = {
    .driver = {
        .name  = DEVNAME,
        .owner = THIS_MODULE,
    },
    .probe  = brook_probe,
    .remove = brook_remove,
};

/*
 * Module init: allocate and add the "brook" platform device, then
 * register the matching platform driver.
 *
 * platform_device_alloc() + platform_device_add() are used instead of
 * platform_device_register() to avoid the complaint
 *     "Device 'brook.0' does not have a release() function,
 *      it is broken and must be fixed."
 *
 * The two failure paths are kept separate on purpose: a device that was
 * only allocated must just be put, while a device that was successfully
 * added must be unregistered. Falling through from one label into the
 * other would release the device twice.
 *
 * Returns 0 on success, -ENOMEM if the device cannot be allocated, or the
 * error from platform_device_add() / platform_driver_register().
 */
static int __init brook_init(void)
{
    int err;
    pr_info("%s(#%d)\n", __func__, __LINE__);

    brook_device = platform_device_alloc(DEVNAME, 0);
    if (!brook_device) {
        pr_err("%s(#%d): platform_device_alloc fail\n",
               __func__, __LINE__);
        return -ENOMEM;
    }

    err = platform_device_add(brook_device);
    if (err) {
        pr_err("%s(#%d): platform_device_add failed\n",
               __func__, __LINE__);
        goto dev_add_failed;
    }

    err = platform_driver_register(&brook_driver);
    if (err) {
        dev_err(&(brook_device->dev), "%s(#%d): platform_driver_register fail(%d)\n",
                __func__, __LINE__, err);
        goto dev_reg_failed;
    }
    return 0;

dev_reg_failed:
    /* the device had been added: unregister it (this also drops it) */
    platform_device_unregister(brook_device);
    return err;

dev_add_failed:
    /* the device was only allocated, never added: just drop the reference */
    platform_device_put(brook_device);
    return err;
}
module_init(brook_init);

/* Module exit: log through the device, then unregister device and driver. */
static void __exit brook_exit(void)
{
    struct device *dev = &brook_device->dev;

    dev_info(dev, "%s(#%d)\n", __func__, __LINE__);

    platform_device_unregister(brook_device);
    platform_driver_unregister(&brook_driver);
}
module_exit(brook_exit);

Documentation/driver-model
What is the difference between Platform driver and normal device driver?
The platform device API
Linux Kernel architecture for device drivers
platform_driver_register()--如何match之后调用probe
Improved dynamically allocated platform_device interface

2011年12月10日星期六

send signal to user-space

某天有個需求是希望當kernel發生某事件時通知user-space的process，心裡想最快就是送signal，於是google一下，果然有人有類似的需求，signals handling in the kernel，於是改了一下把他放上來，值得一提的是，其實這樣並不被鼓勵的，而且原本的kill_proc_info並沒有被export出來，所以如果是module要使用的話，就必須把他export出來，EXPORT_SYMBOL(kill_proc_info)。

#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>

#include <linux/sched.h>
#include <linux/kernel.h> /* printk() */
// #include <linux/slab.h> /* kmalloc() */
#include <linux/errno.h>  /* error codes */
#include <linux/types.h>  /* size_t */
#include <linux/string.h> /* memset() */
#include <linux/signal.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>

#define PROC_NAME "sig2pid"

/**
 * send_sig_to_pid - send a signal to a process
 * @sig: signal number to deliver
 * @pid: pid of the target process
 *
 * Fills in a siginfo that describes the current task as the sender and
 * hands it to kill_proc_info().
 *
 * Return: the value returned by kill_proc_info().
 */
static int send_sig_to_pid(int sig, pid_t pid)
{
    struct siginfo info;

    /*
     * Clear the whole structure first: only a few members are assigned
     * below, and the rest would otherwise carry stale kernel stack
     * contents along with the signal.
     */
    memset(&info, 0, sizeof(info));

    info.si_signo = sig;
    info.si_errno = 0;
    info.si_code = SI_USER; // sent by kill, sigsend, raise
    info.si_pid = get_current()->pid; // sender's pid
    info.si_uid = current_uid(); // sender's uid

    return kill_proc_info(sig, &info, pid);
}

/**
 * sig2pid_proc_write - write handler of /proc/sig2pid
 * @file: file being written (unused)
 * @buffer: user-space buffer holding "<pid> <sig>" as decimal text
 * @count: number of bytes in @buffer
 * @data: private data of the proc entry (unused)
 *
 * Return: @count on success, -EINVAL if the input is empty, too long or
 * not two integers, -EFAULT if @buffer cannot be read, or the error
 * reported by send_sig_to_pid().
 */
static int
sig2pid_proc_write(struct file *file, const char __user * buffer,
                     unsigned long count, void *data)
{
    /*
     * Fixed-size buffer: two decimal ints plus separator and newline fit
     * easily. Sizing the buffer by the user-controlled @count would let a
     * large write overflow the kernel stack.
     */
    char line[32];
    int sig, pid, ret;

    /* leave room for the terminating NUL */
    if (count == 0 || count >= sizeof(line))
        return -EINVAL;

    if (copy_from_user(line, buffer, count))
        return -EFAULT;

    /* user data is not NUL-terminated; sscanf() needs a proper string */
    line[count] = '\0';

    /* both values must be present, otherwise pid/sig stay uninitialized */
    if (sscanf(line, "%d %d", &pid, &sig) != 2)
        return -EINVAL;

    printk("%s(#%d): pid(%d), sig(%d)\n",
            __func__, __LINE__, pid, sig);

    /* report delivery failures to the writer instead of hiding them */
    ret = send_sig_to_pid(sig, (pid_t) pid);
    if (ret)
        return ret;

    return count;
}

/**
 * create_proc_file - create /proc/sig2pid
 *
 * Creates the write-only proc entry and installs sig2pid_proc_write() as
 * its write handler.
 *
 * Return: 0 on success, -EFAULT if the entry cannot be created.
 */
static int create_proc_file(void)
{
    struct proc_dir_entry *entry =
        create_proc_entry(PROC_NAME, S_IFREG | S_IWUGO, NULL);

    if (entry) {
        entry->write_proc = sig2pid_proc_write;
        return 0;
    }

    printk("%s(#%d): create proc entry failed\n", __func__, __LINE__);
    return -EFAULT;
}

/* Module init: create the proc entry and return the result of doing so. */
int sig2pid_init_module(void)
{
    return create_proc_file();
}

/* Module exit: remove the proc entry created at init time. */
void sig2pid_exit_module(void)
{
    remove_proc_entry(PROC_NAME, NULL);
}

module_init(sig2pid_init_module);
module_exit(sig2pid_exit_module);

http://old.nabble.com/signals-handling-in-the-kernel-to12032525.html#a12032525 , signals handling in the kernel.
http://kerneltrap.org/node/5800, how to send signal from kernel space to user space.

github： https://github.com/brook-kuo/Linux_Module/tree/master/process/send_sig_to_userspace

2011年11月6日星期日

利用gen_init_cpio建立initrd的script

#!/bin/bash
# Build an initrd image from a root filesystem directory by running the
# kernel tree's gen_initramfs_list.sh and gen_init_cpio.
#
# usage: <script> [initramfs_dir] [output_file]

# kernel source directory
KERN_DIR=/usr/src/linux-kvm
# temporary file list produced by gen_initramfs_list.sh
INITRAMFS_LIST=/tmp/gen_initramfs_list
# source directory of the initramfs: first argument, or the default
INITRAMFS_DIR=${1:-/home/brook/projects/rootfs}
# output initrd file name: second argument, or the default
INITRD=${2:-/home/brook/initrd}

# quoting keeps paths containing spaces intact
if [ ! -d "$INITRAMFS_DIR" ]; then
    echo "usage: $0 <initramfs_dir> <output_file>"
    exit 1
fi

# stop at the first failing step so a broken list is never turned into an image
sh "$KERN_DIR/scripts/gen_initramfs_list.sh" -d "$INITRAMFS_DIR" > "$INITRAMFS_LIST" || exit 1
"$KERN_DIR/usr/gen_init_cpio" "$INITRAMFS_LIST" > "$INITRD" || exit 1

2011年5月7日星期六

Linux Kernel（14）- Kernel Synchronization

這裡簡單介紹一下Kernel Synchronization的幾個觀念。

Race Condition
Critical Regions(或稱critical sections)
Kernel Synchronization

保護的重點是shared data

Kernel Synchronization常見的作法就是locking，每次只允許一個process可以存取share data，就可以避免race condition了。

Interrupt
Softirq
kernel preemption
Sleeping
SMP

在Linux kernel中，執行的context主要分成兩種：interrupt context和process context。凡是in_interrupt()為真的都是interrupt context，引起的原因包含hardware interrupt和softirq兩種。以下就擷取片段程式碼說明。

// thread_info is stored in the task's stack
# define task_thread_info(task)  ((struct thread_info *)(task)->stack)

// preempt_count in thread_info is divided into several bit fields:
// bits 0-7 are the preemption count (max preemption depth: 256)
// bits 8-15 are the softirq count (max # of softirqs: 256)
// bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
// bit 26 is the NMI_MASK
// bit 28 is the PREEMPT_ACTIVE flag
struct thread_info {
    struct task_struct *task;   /* main task structure */
    int    preempt_count;       /* 0 => preemptable */
};

// accessors for the preempt_count of the current thread_info
# define preempt_count()        (current_thread_info()->preempt_count)
# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)

// every call to irq_enter() increments the HARDIRQ part of preempt_count
#define __irq_enter()                       \
    do {                                    \
        account_system_vtime(current);      \
        add_preempt_count(HARDIRQ_OFFSET);  \
        trace_hardirq_enter();              \
    } while (0)

// in this excerpt invoke_softirq() is simply do_softirq(), and leaving an
// interrupt subtracts the same HARDIRQ_OFFSET that __irq_enter() added
# define invoke_softirq()   do_softirq()
# define IRQ_EXIT_OFFSET    HARDIRQ_OFFSET
/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
    // leaving the hard interrupt, so subtract HARDIRQ_OFFSET again
    sub_preempt_count(IRQ_EXIT_OFFSET);
    // if we are no longer in interrupt context (e.g. not inside a softirq)
    // and a softirq has been raised, run the softirqs
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();
}

此圖出於http://blog.csdn.net/hero7935/archive/2011/05/07/6401522.aspx

Linux Kernel Development 2nd, Novell Press

2011年2月27日星期日

Linux Modules（14.1）- Read Copy Update

RCU (Read-Copy Update)是kernel同步機制之一，允許多個reader在writer更新資料的同時讀取資料，reader可能讀到更新前或更新後的資料，但是資料內容是一致的(不是新的就是舊的，這是因為RCU利用指標的dereference和assign達成的)。另外，RCU也能確保資料在read-side使用時不會被free(下一篇介紹RCU的原理時再提吧)。

這裡有一張圖用來描述RCU再經典不過了。首先，藍色的reader的開始就是rcu_read_lock()，結束就是rcu_read_unlock()，下面的removal、grace period和reclamation代表著writer的狀態，這邊只要保證讀到舊資料的reader(就是開頭落在removal的reader)，都能在grace period結束之前，離開read-side就可以了，聰明的你一定可以看出在grace period開始之後的reader都是讀到新資料，所以RCU就不管他想用多久。

removal：更新指標。
grace period：等待所有持有舊資料的reader都離開RCU read-side。
reclamation：回收舊資料。

RCU本身就是read-write lock的一種，所以我們介紹一下RCU的reader和writer的形式。

// RCU-protected payload used by the reader/writer examples below
struct foo {
    int x;
};

// shared pointer to the current data; starts out NULL, i.e. there is no
// data until writer() has published some
static struct foo *foo = NULL;

// Reader的形式
static int reader(void)
{
    int ret;

    rcu_read_lock();
    ret = rcu_dereference(foo)->x;
    rcu_read_unlock();

    return ret;
}

// Form of a writer: copy the current data, modify the copy, publish it,
// wait for a grace period and finally free the old data.
//
// If no memory is available the update is dropped and the current data is
// left untouched (the function has no way to report an error).
// NOTE(review): the plain read of foo assumes that writers are serialized
// by the caller - confirm.
static void writer(int x)
{
    struct foo *new_foo, *old_foo = foo;

    // build the new data content new_foo
    new_foo = kmalloc(sizeof(*new_foo), GFP_KERNEL);
    if (!new_foo)
        return;

    // copy the original content; on the very first update there is none
    // yet (foo starts out NULL), and x, the only member, is set below
    if (old_foo)
        *new_foo = *old_foo;

    // modify the content
    new_foo->x = x;

    // removal: publish the new pointer
    rcu_assign_pointer(foo, new_foo);

    // grace period: wait for readers that may still hold the old pointer
    synchronize_rcu();

    // reclamation: free the old data (kfree(NULL) is a no-op)
    kfree(old_foo);

}

synchronize_rcu()就是在等待所謂的grace period，等所有持舊資料的reader都離開RCU read-side才會往下執行kfree(old_foo)。

What is RCU, Fundamentally?

2011年2月26日星期六

Linux Kernel（8.1）- Notifier機制剖析

由Linux Kernel（8）- Notification可以學會運用notifier，而這一篇會概述如何實現，基本上所謂的publish-and-subscribe pattern都是註冊callback function到某個list上去，某事件發生時，再將整個list的callback function執行過一次。

include/linux/notifier.h

/* Define a blocking notifier chain head called name, initialized through BLOCKING_NOTIFIER_INIT(). */
#define BLOCKING_NOTIFIER_HEAD(name)                \
        struct blocking_notifier_head name =            \
        BLOCKING_NOTIFIER_INIT(name)

struct blocking_notifier_head {
    // used by the blocking flavour of the notifier chain;
    // the following comment can be found in kernel/notifier.c
    /*
     * Blocking notifier chain routines.  All access to the chain is
     * synchronized by an rwsem.
     */
    struct rw_semaphore rwsem;
    // head of the linked list of callback functions
    struct notifier_block __rcu *head;
};

// node structure of the linked list
struct notifier_block {
    // callback function
    int (*notifier_call)(struct notifier_block *, unsigned long, void *);
    struct notifier_block __rcu *next;
    // ordering key used when registering on the list:
    // the larger the number, the higher the priority
    int priority;
};

kernel/notifier.c

/**
 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to a blocking notifier chain.
 * Must be called in process context.
 * (the chain is protected by a rw-semaphore, which per the original
 * annotation can only be used in process context)
 *
 * Currently always returns zero.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
    int ret;

    /*
     * This code gets used during boot-up, when task switching is
     * not yet working and interrupts must remain disabled.  At
     * such times we must not call down_write().
     */
    if (unlikely(system_state == SYSTEM_BOOTING))
        return notifier_chain_register(&nh->head, n);

    // take the semaphore for writing to keep the chain update synchronized
    down_write(&nh->rwsem);

    // the function that actually links the callback into the list
    ret = notifier_chain_register(&nh->head, n);

    up_write(&nh->rwsem);
    return ret;
}


/*
 *  Notifier chain core routines.  The exported routines below
 *  are layered on top of these, with appropriate locking added.
 */

/*
 * Insert n into the chain so that entries stay sorted by descending
 * priority; among equal priorities the new entry goes last.
 * Always returns 0.
 */
static int notifier_chain_register(struct notifier_block **nl,
                struct notifier_block *n)
{
    struct notifier_block **link;

    // walk the list from its first node; stop at the first node whose
    // priority is lower than the new one, or at the end of the list
    for (link = nl; *link != NULL; link = &((*link)->next)) {
        if (n->priority > (*link)->priority)
            break;
    }

    // the node found (possibly NULL) becomes the successor of n ...
    n->next = *link;
    // ... and n takes its place in the list
    rcu_assign_pointer(*link, n);
    return 0;
}

/**
 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: Entry to remove from notifier chain
 *
 * Removes a notifier from a blocking notifier chain.
 * Must be called from process context.
 *
 * Returns zero on success or %-ENOENT on failure.
 */
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
    // structurally the same as blocking_notifier_chain_register()

    int ret;

    /*
     * This code gets used during boot-up, when task switching is
     * not yet working and interrupts must remain disabled.  At
     * such times we must not call down_write().
     */
    if (unlikely(system_state == SYSTEM_BOOTING))
        return notifier_chain_unregister(&nh->head, n);

    // take the semaphore for writing to keep the chain update synchronized
    down_write(&nh->rwsem);

    // the function that actually removes the callback
    ret = notifier_chain_unregister(&nh->head, n);

    up_write(&nh->rwsem);
    return ret;
}

/*
 * Unlink n from the chain.
 * Returns 0 when n was found and removed, -ENOENT when it is not on the
 * list.
 */
static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
{
    // advance nl from link to link until it points at n
    for (; (*nl) != NULL; nl = &((*nl)->next)) {
        if ((*nl) != n)
            continue;

        // found the position of n in the list: bypass it
        rcu_assign_pointer(*nl, n->next);
        return 0;
    }
    return -ENOENT;
}


/*
 * Call every notifier on the chain: nr_to_call is passed as -1 and
 * nr_calls as NULL, the "don't care" values of
 * __blocking_notifier_call_chain().
 */
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v)
{
    return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
}


/**
 *  __blocking_notifier_call_chain - Call functions in a blocking notifier chain
 *  @nh: Pointer to head of the blocking notifier chain
 *  @val: Value passed unmodified to notifier function
 *  @v: Pointer passed unmodified to notifier function
 *  @nr_to_call: See comment for notifier_call_chain.
 *  @nr_calls: See comment for notifier_call_chain.
 *
 *  Calls each function in a notifier chain in turn.  The functions
 *  run in a process context, so they are allowed to block.
 *
 *  If the return value of the notifier can be and'ed
 *  with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
 *  will return immediately, with the return value of
 *  the notifier function which halted execution.
 *  Otherwise the return value is the return value
 *  of the last notifier function called.
 */
int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
            unsigned long val, void *v, int nr_to_call, int *nr_calls)
{
    int ret = NOTIFY_DONE;

    /*
     * We check the head outside the lock, but if this access is
     * racy then it does not matter what the result of the test
     * is, we re-check the list after having taken the lock anyway:
     */
    if (rcu_dereference_raw(nh->head)) {
        down_read(&nh->rwsem);
        // the API that actually runs all callbacks on the list
        ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
                                nr_calls);
        up_read(&nh->rwsem);
    }
    return ret;
}

/**
 *  notifier_call_chain - Informs the registered notifiers about an event.
 *  @nl:        Pointer to head of the blocking notifier chain
 *  @val:       Value passed unmodified to notifier function
 *  @v:     Pointer passed unmodified to notifier function
 *  @nr_to_call:    Number of notifier functions to be called. Don't care
 *                  value of this parameter is -1.
 *  @nr_calls:  Records the number of notifications sent. Don't care
 *              value of this field is NULL.
 *  @returns:   notifier_call_chain returns the value returned by the
 *              last notifier function called.
 */
static int __kprobes notifier_call_chain(struct notifier_block **nl,
            unsigned long val, void *v, int nr_to_call, int *nr_calls)
{
    int ret = NOTIFY_DONE;
    struct notifier_block *nb, *next_nb;

    nb = rcu_dereference_raw(*nl);

    // blocking_notifier_call_chain passes nr_to_call as -1; since
    // nr_to_call is only ever decremented it stays non-zero, so the only
    // remaining stop condition is nb becoming NULL
    while (nb && nr_to_call) {
        // the successor is fetched before the callback runs rather than
        // afterwards - presumably so that the walk does not depend on
        // nb->next once the callback has run (e.g. a callback that
        // unregisters itself); TODO confirm
        next_nb = rcu_dereference_raw(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
        if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
            WARN(1, "Invalid notifier called!");
            nb = next_nb;
            continue;
        }
#endif
        // run the callback function
        ret = nb->notifier_call(nb, val, v);

        if (nr_calls)
            (*nr_calls)++;

        // a result carrying the STOP bits ends the walk; the remaining
        // callbacks are not executed
        if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
            break;
        nb = next_nb;
        nr_to_call--;
    }
    return ret;
}

這篇文章還不算完成，後面再補充啦~~

2011年1月16日星期日

Linux softirq執行分析(轉)

又是一篇精彩的文章，強力轉貼。

Linux softirq執行分析 

Author:  sinister
Email:   sinister@whitecell.org
Homepage:http://www.whitecell.org 
Date:    2007-01-11

本文對 Linux 內核軟中斷的執行流程進行了分析，並盡可能的結合當前運行環境詳細地寫出我的理解，
但這並不表明我的理解一定正確。這本是論壇裏的一篇帖子，發出來是為了抛磚引玉，如果您在閱讀本文
時發現了我的錯誤，還望得到您的指正。


今天無意中看了眼 2.6 內核的軟中斷實現，發現和以前我看到的大不相同（以前也是走馬觀花，不大仔
細），可以說改動很大。連 softirq 的調用點都不一樣了，以前是三個調用點，今天搜索了一下源代
碼，發現在多出了ksoftirqd 後，softirq 在系統中的調用點僅是在 ISR 返回時和使用了 
local_bh_enable() 函數後被調用了。網卡部分的顯示調用，我覺得應該不算是系統中的調用點。
ksoftirqd 返回去調用 do_softirq() 函數應該也只能算是其中的一個分支，因為其本身從源頭上
來講也還是在 ISR 返回時 irq_exit() 調用的。這樣一來就和前些日子寫的那份筆記
（Windows/Linux/Solaris 軟中斷機制）裏介紹的 Linux 內核部分的軟中斷有出處了，看來以後
討論 Linux kernel 代碼一定要以內核版本為前題，要不非亂了不可。得買本 Linux 方面的書了，
每次上來直接看相關代碼也不是回事，時間也不允許。


//
// Called when do_IRQ leaves after the hardware ISR has been executed.
//

void irq_exit(void)
{
    account_system_vtime(current);
    trace_hardirq_exit();
    sub_preempt_count(IRQ_EXIT_OFFSET);

        //
        // Check that we are no longer in interrupt context (no nested
        // hardware interrupt, no softirq being processed) and that a
        // softirq is pending. Both conditions must hold before
        // do_softirq() can be entered, i.e. all hardware interrupt
        // handling has finished and some handler has raised a softirq.
        //
    if (!in_interrupt() && local_softirq_pending())
                //
                // per the original annotation this ends up calling
                // do_softirq()
                //
        invoke_softirq();
    preempt_enable_no_resched();
}


#ifndef __ARCH_HAS_DO_SOFTIRQ

//
// Entry point for softirq processing: runs __do_softirq() with interrupts
// disabled when softirqs are pending and we are not in interrupt context.
//
asmlinkage void do_softirq(void)
{
    __u32 pending;
    unsigned long flags;

    //
    // Return immediately when in_interrupt() is true, i.e. (per the
    // original annotation) a hardware interrupt is nested or a softirq is
    // already being processed. The check at this entry mainly provides
    // mutual exclusion with ksoftirqd.
    //
    if (in_interrupt())
        return;

    //
    // run the following code with interrupts disabled
    //
    local_irq_save(flags);

    //
    // check whether any pending softirq needs handling
    //
    pending = local_softirq_pending();

    //
    // if so, call __do_softirq() to do the actual work
    //
    if (pending)
        __do_softirq();

    //
    // restore the interrupt state and carry on
    //
    local_irq_restore(flags);
}


//
// Upper bound (10) on how many passes __do_softirq() makes over the
// pending softirqs before handing the rest to ksoftirqd.
//

#define MAX_SOFTIRQ_RESTART 10

//
// Core softirq loop: runs the action of every pending softirq, repeating
// for at most MAX_SOFTIRQ_RESTART passes while new softirqs keep being
// raised, and wakes ksoftirqd for whatever is still pending afterwards.
// The explanatory comments below are translated from the original
// author's analysis.
//
asmlinkage void __do_softirq(void)
{
    //
    // Softirq handler descriptor; it holds the callback function that
    // was registered for the softirq.
    //
    struct softirq_action *h;
    __u32 pending;
    int max_restart = MAX_SOFTIRQ_RESTART;
    int cpu;

    //
    // fetch all currently pending softirqs
    //
    pending = local_softirq_pending();
    account_system_vtime(current);

    //
    // Mask further softirq processing from here on; this is what
    // guarantees that only one softirq run is active per CPU at a time.
    //
    __local_bh_disable((unsigned long)__builtin_return_address(0));
    trace_softirq_enter();

    //
    // for SMP: get the CPU we are currently running on
    //
    cpu = smp_processor_id();
//
// loop label
//
restart:
    //
    // On every pass, clear the pending softirq bits before hardware
    // ISRs are allowed to preempt us again.
    //
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);

    //
    // Interrupts are enabled only here. Up to this point we have been
    // running with interrupts disabled, so softirq processing cannot be
    // preempted by a hardware interrupt right from its start; only the
    // code after this point can be.
    //
    local_irq_enable();

    //
    // Note: the code below can be preempted by hardware interrupts, but a
    // softirq raised by such an ISR cannot run right away, because
    // __local_bh_disable() above has masked softirqs. It sets a flag that
    // acts as a mutex, and that flag is part of what in_interrupt()
    // tests in irq_exit() and do_softirq() - in_interrupt() checks for
    // softirq processing as well as for hard interrupts. So a softirq
    // raised now cannot re-enter this function; it is only marked pending
    // and waits for one of the repeated passes below (at most
    // MAX_SOFTIRQ_RESTART of them).
    //


    //
    // get the softirq vector table
    //
    h = softirq_vec;

    //
    // loop over all registered softirq handlers
    //
    do {
        //
        // a set pending bit means the handler registered for this
        // softirq has to be run
        //
        if (pending & 1) {
            //
            // run the callback registered for this softirq
            //
            h->action(h);
            rcu_bh_qsctr_inc(cpu);
        }
        //
        // keep going until every pending softirq in the vector table
        // has been handled
        //
        h++;

        //
        // pending is a 32-bit mask that is shifted bit by bit, so one
        // pass handles at most 32 softirq callbacks
        //
        pending >>= 1; 
    } while (pending);

    //
    // Disable interrupts again: from here on hardware interrupts cannot
    // preempt the code below.
    //
    local_irq_disable();

    //
    // While interrupts were enabled we may have been preempted by
    // hardware interrupts several times, and each of them may have
    // raised a softirq that could not be handled at that point. Re-read
    // the pending softirqs so they can be processed by jumping back to
    // restart.
    //
    pending = local_softirq_pending();

    //
    // Softirqs raised in the interrupts-enabled window have their pending
    // bit set but could not run while softirqs were masked, because
    // irq_exit() and do_softirq() cannot enter this function (see above).
    // They get their chance here, even though softirqs are still masked.
    //

    //
    // If softirqs were raised in the meantime and fewer than 10 passes
    // have been made, jump back to restart and repeat all the steps
    // above: clear the pending bits, enable interrupts, run the
    // handlers... Both conditions must hold for another pass.
    //
    if (pending && --max_restart)
        goto restart;

    //
    // If softirqs are still pending after 10 passes, the system is
    // probably under peak load. To even this out the remaining work is
    // handed to the dedicated ksoftirqd thread. That thread is one big
    // loop which explicitly calls preempt_xxx() and schedule() so that
    // other processes can preempt it, and which explicitly calls
    // do_softirq() once local_softirq_pending() reports pending
    // softirqs. The thread woken up below may therefore end up in this
    // function again through do_softirq(); that is why do_softirq()
    // checks in_interrupt() at its entry - to prevent re-entry.
    //
    if (pending)
               //
               // per the original annotation this calls wake_up_process()
               // to wake ksoftirqd
               //
        wakeup_softirqd();

    trace_softirq_exit();
    account_system_vtime(current);

    //
    // Softirq processing is unmasked only at the very end. Note that this
    // is not local_bh_enable(), so do_softirq() is not triggered again
    // from here.
    //
    _local_bh_enable();
}


//
// Body of the ksoftirqd kernel thread (kernel/softirq.c, 2.6.19.1).
// Until kthread_should_stop() is true it sleeps while no softirq is
// pending and otherwise calls do_softirq() in a loop, offering to
// reschedule after every round. __bind_cpu carries the CPU number
// (it is cast to long for cpu_is_offline()). Always returns 0.
//
static int ksoftirqd(void * __bind_cpu)
{
    //
    // Explicitly set the nice value of this thread to 19.
    //
    set_user_nice(current, 19);

    //
    // Flag this thread as PF_NOFREEZE (presumably so the freezer
    // leaves it alone during suspend -- confirm).
    //
    current->flags |= PF_NOFREEZE;

    //
    // Mark the thread as interruptible-sleeping before the first
    // check of the loop condition.
    //
    set_current_state(TASK_INTERRUPTIBLE);

    //
    // Main loop: keep going until the thread is asked to stop, and
    // on every pass check whether softirqs are pending.
    //
    while (!kthread_should_stop()) {
        //
        // No preemption while the pending check and the processing
        // below are in progress.
        //
        preempt_disable();

        //
        // Nothing pending right now?
        //
        if (!local_softirq_pending()) {
            //
            // Re-enable preemption before giving up the CPU,
            // because everything so far ran with it disabled.
            //
            preempt_enable_no_resched();

            //
            // Voluntarily give up the CPU (scheduler details are
            // out of scope here).
            //
            schedule();

            //
            // When this thread is scheduled again it resumes at
            // the statement right after schedule(), i.e. at the
            // preempt_disable() below.
            //

            //
            // Disable preemption again for the processing below.
            //
            preempt_disable();
        }

        //
        // Both paths end up here with preemption disabled: either
        // softirqs were already pending on entry, or the thread
        // slept and has just been given the CPU back. Mark the
        // thread runnable.
        //
        __set_current_state(TASK_RUNNING);

        //
        // Process softirqs for as long as some are pending. This is
        // one more entry point into do_softirq(): when __do_softirq()
        // gives up after its restart limit with work still pending it
        // wakes this thread, which then calls do_softirq() again. On
        // a very busy system this thread and do_softirq() alternate,
        // so the thread can still use a lot of CPU even though
        // cond_resched() below gives other tasks a chance after every
        // round.
        //
        while (local_softirq_pending()) {
            /* Preempt disable stops cpu going offline.
               If already offline, we'll be on wrong CPU:
               don't process */
            if (cpu_is_offline((long)__bind_cpu))
                //
                // The bound CPU is offline: jump to wait_to_die
                // and wait there until the thread is stopped.
                //
                goto wait_to_die;

                //
                // NOTE: the statements from here to the end of the
                // loop are indented as if they belonged to the if
                // above, but only the goto does; they run on every
                // iteration.
                //
                // Run the pending softirq handlers. According to the
                // earlier part of this article, do_softirq() returns
                // immediately when in_interrupt() reports that
                // softirq processing is already in progress.
                //
                do_softirq();

                //
                // Allow preemption again.
                //
                preempt_enable_no_resched();
                        
                //
                // May end up calling schedule(); with preemption
                // enabled again, another task can run after each
                // round of softirq handling. This keeps the thread
                // from monopolising the CPU under heavy load and
                // keeps other tasks responsive.
                //
                cond_resched();

                //
                // Disable preemption for the next pending check.
                //
                preempt_disable();

                //
                // Loop again if softirqs are still pending.
                //
        }

        //
        // Everything has been processed: allow preemption, mark the
        // thread interruptible-sleeping and go round the main loop
        // again.
        //
        preempt_enable();
        set_current_state(TASK_INTERRUPTIBLE);
    }
   
    //
    // The thread was asked to stop: mark it runnable and return.
    //
    __set_current_state(TASK_RUNNING);
    return 0;

//
// Reached when the bound CPU is offline: wait until the thread is
// stopped.
//
wait_to_die:

    //
    // Allow preemption again (it was disabled when we jumped here).
    //
    preempt_enable();
    /* Wait for kthread_stop */

    //
    // Mark the thread interruptible-sleeping before checking the
    // stop condition.
    //
    set_current_state(TASK_INTERRUPTIBLE);

    //
    // Until a stop is requested, give up the CPU and go back to the
    // interruptible state after every wake-up.
    //
    while (!kthread_should_stop()) {
        schedule();
        set_current_state(TASK_INTERRUPTIBLE);
    }

    //
    // Stop requested: mark the thread runnable and return.
    //
    __set_current_state(TASK_RUNNING);
    return 0;
}


參考：
linux kernel source 2.6.19.1 /kernel/softirq.c
WSS(Whitecell Security Systems)，一個非營利性民間技術組織，致力於各種系統安全技術的研究。
堅持傳統的hacker精神，追求技術的精純。
WSS 主頁：http://www.whitecell.org/ 
WSS 論壇：http://www.whitecell.org/forums/

2011年1月9日星期日

Linux Modules（1.1）module parameters

Linux Module允許使用者在insmod時帶入相關的parameters，這些parameters必須被宣告成global，並且使用module_param()宣告資料型態與權限，目前支援的資料型態有byte, short, ushort, int, uint, long, ulong, charp, bool等等。也可以使用module_param_array(name, type, num, perm)宣告成陣列。perm(權限)會決定/sys/module/顯示該參數的權限。

#include <linux/init.h>
#include <linux/module.h>

MODULE_LICENSE("GPL");

/*
 * One module parameter for each supported module_param() type. Every
 * parameter is registered with S_IRUGO|S_IWUSR (read for everyone, write
 * for the owner); the initializer is the value used when insmod passes
 * nothing for that parameter.
 */
static unsigned char b_byte = 1;
module_param(b_byte, byte, S_IRUGO|S_IWUSR);

static short int b_short = 2;
module_param(b_short, short, S_IRUGO|S_IWUSR);

static unsigned short int b_ushort = 3;
module_param(b_ushort, ushort, S_IRUGO|S_IWUSR);

/* NOTE(review): 6 breaks the otherwise ascending defaults (1, 2, 3, ..., 7);
 * possibly meant to be 4 -- confirm with the author. */
static int b_int = 6;
module_param(b_int, int, S_IRUGO|S_IWUSR);

static unsigned int b_uint = 5;
module_param(b_uint, uint, S_IRUGO|S_IWUSR);

static long b_long = 6;
module_param(b_long, long, S_IRUGO|S_IWUSR);

static unsigned long b_ulong = 7;
module_param(b_ulong, ulong, S_IRUGO|S_IWUSR);

/* String parameter; the default points at a string literal. */
static char *b_charp = "brook";
module_param(b_charp, charp, S_IRUGO|S_IWUSR);

/* NOTE(review): a "bool" parameter backed by an int variable; later kernels
 * presumably require the variable itself to be bool -- confirm for the
 * target kernel version. */
static int b_bool = 1;
module_param(b_bool, bool, S_IRUGO|S_IWUSR);

/*
 * Module entry point: writes the current value of every module parameter
 * to the kernel log so the effect of the insmod arguments can be checked.
 *
 * Returns 0; loading never fails.
 */
static int __init init_modules(void)
{
    /* Every message carries an explicit log level instead of relying on
     * the kernel's default message level. */
    printk(KERN_INFO "b_byte: %d\n", b_byte);
    printk(KERN_INFO "b_short: %d\n", b_short);
    printk(KERN_INFO "b_ushort: %u\n", b_ushort);
    printk(KERN_INFO "b_int: %d\n", b_int);
    printk(KERN_INFO "b_uint: %u\n", b_uint);
    printk(KERN_INFO "b_long: %ld\n", b_long);
    printk(KERN_INFO "b_ulong: %lu\n", b_ulong);
    printk(KERN_INFO "b_charp: %s\n", b_charp);
    printk(KERN_INFO "b_bool: %d\n", b_bool);

    return 0;
}

/* Module exit point: intentionally empty, init_modules() acquires nothing
 * that would have to be released. */
static void __exit exit_modules(void)
{
}

module_init(init_modules);
module_exit(exit_modules);

Kernel Version：2.6.35

Linux Device Drivers, 3e
Document/printk-formats.txt

2010年12月25日星期六

Linux Modules（7.3）- work queue

Work queue提供一個interface，讓使用者輕易的建立kernel thread並且將work綁在這個kernel thread上面，如下圖[1]所示。

由於work queue是建立一個kernel thread來執行，所以是在process context，不同於tasklet的interrupt context，因此，work queue可以sleep(設定semaphore或者執行block I/O等等)。

Creating Work
透過 DECLARE_WORK(name, void (work_func_t)(struct work_struct *work)); // statically
或者
INIT_WORK(struct work_struct*, void (work_func_t)(struct work_struct *work)); //dynamically
建立work，就是要執行的工作。
有了work還需要將它和work thread結合，您可以透過create_singlethread_workqueue("name")建立一個名為name的single thread(執行work的thread就稱為work thread)，或者create_workqueue("name")建立per cpu的thread。接著就是要將work和work thread做關聯了，透過queue_work(work_thread, work)就可以將work送給work thread執行了。

queue_delayed_work(work_thread, delayed_work, delay)為queue_work()的delay版本。
flush_workqueue(work_thread)會wait直到這個work_thread的work都做完。flush_workqueue()並不會取消任何被delay執行的work，如果要取消delayed的work則需要呼叫cancel_delayed_work(delayed_work)將delayed_work自某個work thread中移除。

最後，要將work_thread摧毀要呼叫destroy_workqueue(work_thread)。

event/n
除了自己建立work thread以外，kernel還建立一個公用的work thread稱為event

kernel/workqueue.c

void __init init_workqueues(void)
{
    …
    keventd_wq = create_workqueue("events");
    …
}

您可以透過schedule_work(&work)將work送給"events"執行，flush_scheduled_work(void)等待"events"中所有的work執行完畢。

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/slab.h>

MODULE_LICENSE("GPL");

/* Work handlers, defined further down. */
static void brook_1_routine(struct work_struct *);
static void brook_2_routine(struct work_struct *);
static void brook_3_routine(struct work_struct *);

static struct work_struct *brook_1_work; // for event
static DECLARE_WORK(brook_2_work, brook_2_routine);
static DECLARE_DELAYED_WORK(brook_3_work, brook_3_routine);
static struct workqueue_struct *brook_workqueue;

/*
 * Non-zero tells the self-requeueing work to stop. Readable by everyone
 * but writable by the owner only: a world-writable parameter would let any
 * local user stop the work items.
 */
static int stop_wq;
module_param(stop_wq, int, S_IRUGO | S_IWUSR);

/*
 * Module entry point.
 *
 * Allocates brook_1_work and hands it to the shared "events" workqueue,
 * then creates the private "brook_wq" workqueue and queues brook_2_work
 * and the delayed brook_3_work on it.
 *
 * Returns 0 on success, -ENOMEM when the work item or the workqueue cannot
 * be allocated (nothing is left allocated in that case).
 */
static int __init init_modules(void)
{
    /* Clear the stop flag before any handler can run and test it. */
    stop_wq = 0;

    // for event
    brook_1_work = kzalloc(sizeof(*brook_1_work), GFP_KERNEL);
    if (!brook_1_work)
        return -ENOMEM;

    // for brook_wq
    brook_workqueue = create_workqueue("brook_wq");
    if (!brook_workqueue) {
        kfree(brook_1_work);
        brook_1_work = NULL;
        return -ENOMEM;
    }

    /* Both allocations succeeded: only now start queueing work. */
    INIT_WORK(brook_1_work, brook_1_routine);
    schedule_work(brook_1_work);

    queue_work(brook_workqueue, &brook_2_work);
    queue_delayed_work(brook_workqueue, &brook_3_work, 0);
    return 0;
}

/*
 * Module exit point: stops all work items and releases everything that
 * init_modules() set up.
 *
 * Order matters: the stop flag is raised first so a handler that is
 * running right now does not requeue itself, then the self-re-arming work
 * items are cancelled synchronously, and only then is the workqueue
 * destroyed. Finally the work item given to the shared "events" queue is
 * cancelled and freed so it cannot run after the module text is gone.
 */
static void __exit exit_modules(void)
{
    stop_wq = 1;

    /* The _sync variants wait for a running handler and cope with work
     * that re-arms itself. */
    cancel_delayed_work_sync(&brook_3_work);
    cancel_work_sync(&brook_2_work);
    destroy_workqueue(brook_workqueue);

    if (brook_1_work) {
        cancel_work_sync(brook_1_work);
        kfree(brook_1_work);
        brook_1_work = NULL;
    }
}

/*
 * Handler of brook_1_work: logs its own name, the CPU it runs on and the
 * name of the thread executing it. ws is not used.
 */
static void brook_1_routine(struct work_struct *ws)
{
    int cpu = smp_processor_id();
    const char *worker = current->comm;

    printk("%s(): on cpu:%d, pname:%s\n", __func__, cpu, worker);
}

/*
 * Handler of brook_2_work: logs where it runs, sleeps for 5000 ms to show
 * that a blocking work item delays the other work on the same workqueue,
 * then queues itself again unless stop_wq is set. ws is not used.
 */
static void brook_2_routine(struct work_struct *ws)
{
    int cpu = smp_processor_id();

    printk("%s(): on cpu:%d, pname:%s\n", __func__, cpu, current->comm);

    /* Block on purpose; work queued behind this item has to wait. */
    msleep(5000);

    if (stop_wq)
        return;

    queue_work(brook_workqueue, &brook_2_work);
}

static void brook_3_routine(struct work_struct *ws)
{
    printk("%s(): on cpu:%d, pname:%s\n",
            __func__, smp_processor_id(), current->comm);
    queue_delayed_work(brook_workqueue, &brook_3_work, 50);
}

module_init(init_modules);
module_exit(exit_modules);

Kernel Version：2.6.35
參考資料：

http://www.embexperts.com/viewthread.php?tid=12&highlight=work%2Bqueue
Linux Kernel Development 2nd, Novell Press

2010年12月11日星期六

Linux Kernel（13）- syscall

System Call在HW和user space提供一層抽象層，主要目的有：

為user space提供硬體抽象層。比如，讀取檔案時，不用管檔案所在的媒體類型與檔案儲存類型。
System call能確保系統的安全與穩定。避免user space的無意或惡意的破壞。

除了exception和trap以外，System call是user space進入kernel space的唯一管道。
User space的programming主要是base on API(Application Programming Interface)並非system call，從programmer的觀點來看，關注的是API(如C library)而非system call。

System call的return type為long，主要是要相容64bit，return value通常代表失敗或成功，失敗時，error code通常會寫入global variable “errno”。

典型的system call都以sys_開頭，如getpid()的system call為：

/* getpid() system call: returns the tgid field of the current task. */
asmlinkage long sys_getpid(void)
{
    long tgid = current->tgid;

    return tgid;
}

在Linux中(x86)，將所有的system call存放在一個system call table中，透過system call number來索引(index)要執行的system call，儘管每個platform所implement的system call table和system call number都不同，但是原理都是相同的，首先會將system call number存放在某個特定的CPU register(X86放在eax)，並將system call的參數也存放至其他的register(最多5個參數，x86依序為ebx、ecx、edx、esi和edi)，接著透過int 0x80進入system call處理程序，透過system call number(eax)在system call table中找到相對應的system call，並且執行該system call，因為參數存放的位置事先就定義好了，所以就可以在registers(x86依序為ebx、ecx、edx、esi和edi)中依序讀出要處理的參數，超過五個參數就會用structure來傳遞，而ioctl這個不定參數的system call是傳遞pointer的方式來存取，ioctl是一個不好的例子，因為定義不是很明確，system call最好是能定義明確。

新增system call “brook()”到kernel 2.6.32的步驟(x86)

新增一筆system call entry到sys_call_table中arch/x86/kernel/syscall_table_32.s。

定義brook的system call number，arch/x86/include/asm/unistd_32.h，並將NR_syscalls做遞增。

#define __NR_brook              337
#define __NR_syscalls           338

定義system call的原型，include/linux/syscalls.h。

asmlinkage long sys_brook(int n, unsigned long arg);

加入至system call table中，arch/x86/kernel/syscall_table_32.S。

.long sys_brook;

撰寫system call的內容。

obj-y := brook.o

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

/*
 * brook(n, arg): example system call that sums n ints read from user space.
 *
 * n   - number of ints to read; nothing is read when n <= 0
 * arg - user-space address of the first int
 *
 * Every element is fetched with get_user() and echoed to the kernel log.
 * Returns the sum, or the get_user() error code as soon as one element
 * cannot be read.
 */
SYSCALL_DEFINE2(brook, int, n, unsigned long, arg)
{
    int __user *p = (int __user *) arg;
    int i, x, sum = 0;

    printk("n=%d, ", n);
    for (i = 0; i < n; i++) {
        int err = get_user(x, p + i);

        /* Check the fetch before x is used: a failed read must not
         * contribute to the sum. */
        if (err) {
            return err;
        }
        sum += x;
        printk("[%d]=%d, ", i, x);
    }

    return sum;
}

ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ brook_syscall/

撰寫Application測試system call

#include <linux/unistd.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#define __NR_brook 337
/*
 * User-space wrapper for the brook system call.
 *
 * n   - number of int arguments that follow
 * ... - exactly n values of type int
 *
 * The variadic arguments are copied into a temporary int array whose
 * address is handed to the kernel. A va_list must not be passed instead:
 * its representation is ABI-specific and it is not guaranteed to be a
 * pointer to the arguments laid out as an int array.
 *
 * Returns the result of syscall(), or -1 when the temporary array cannot
 * be allocated.
 */
int brook(int n, ...)
{
    int *values = NULL;
    int ret;
    int i;
    va_list ap;

    if (n > 0) {
        values = malloc((size_t)n * sizeof *values);
        if (!values)
            return -1;
    }

    va_start(ap, n);
    for (i = 0; i < n; i++)
        values[i] = va_arg(ap, int);
    va_end(ap);

    ret = syscall(__NR_brook, n, values);
    free(values);
    return ret;
}

#include <stdio.h>
#include "brook.h"
/*
 * Test program: calls brook() with the three values 3, 2, 1 and prints the
 * result. Exits with status 0; the number of characters printed by
 * printf() is no longer used as the exit status, since that made every
 * run look like a failure to the shell.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    printf("%d\n", brook(3, 3, 2, 1));
    return 0;
}

Kernel Version：2.6.32
參考資料：

Linux Kernel Development 2nd, Novell Press
http://pradeepkumar.org/2010/01/implementing-a-new-system-call-in-kernel-version-2-6-32.html
Professional Linux Kernel Architecture, Wiley Publishing

config automatically switches from 32-bit to 64-bit for x86

今天我用我的NB去make config，卻發現config會自動的切成64bit的，如果想要編成32bit，就執行linux32 make menuconfig即可。

參考資料：
http://kerneltrap.org/mailarchive/linux-kernel/2010/6/6/4579953/thread

2010年11月27日星期六

Linux Kernel（12.1）- netfilter機制之初探

延續Linux Modules（12）- netfilter我們由nf_register_hooks()來看看netfilter這個framework是如何運作的。

struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;

/*
 * Registers a netfilter hook.
 *
 * The list nf_hooks[reg->pf][reg->hooknum] is walked under nf_hook_mutex
 * until the first element with a priority greater than reg->priority is
 * found; reg is linked in right in front of it, so the list stays ordered
 * by ascending priority.
 *
 * Returns 0 on success or the negative error returned by
 * mutex_lock_interruptible().
 */
int nf_register_hook(struct nf_hook_ops *reg)
{
    struct nf_hook_ops *elem;
    int err;

    err = mutex_lock_interruptible(&nf_hook_mutex);
    if (err < 0)
        return err;
    list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
        if (reg->priority < elem->priority)
            break;
    }
    /* "&reg->list" had been mangled into the (R) sign by HTML entity
     * decoding of "&reg". */
    list_add_rcu(&reg->list, elem->list.prev);
    mutex_unlock(&nf_hook_mutex);
    return 0;
}

void nf_unregister_hook(struct nf_hook_ops *reg)
{
    mutex_lock(&nf_hook_mutex);
    list_del_rcu(®->list);
    mutex_unlock(&nf_hook_mutex);
    synchronize_net();
}

nf_register_hook()其實就是在將要註冊的callback function依照所屬的protocol family以及hooknum插入struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]，並且會依照priority由小到大，而nf_unregister_hook()就是很簡單的將reg由nf_hooks中移除。

接著我們再來看看nf_iterate()，程式碼中以//為註解方式，且為粗體字型就是我的註解。

/*
 * Runs the hooks on the list head, continuing after position *i, and
 * returns the first verdict that is neither NF_ACCEPT nor NF_REPEAT.
 * Returns NF_ACCEPT when the end of the list is reached. *i is updated as
 * the walk proceeds so the caller can continue from the same position.
 */
unsigned int
nf_iterate(struct list_head *head, struct sk_buff *skb,
          unsigned int hook, const struct net_device *indev,
          const struct net_device *outdev, struct list_head **i,
          int (*okfn)(struct sk_buff *), int hook_thresh)
{
    unsigned int verdict;

    /*
     * The caller must not block between calls to this
     * function because of risk of continuing from deleted element.
     */
    list_for_each_continue_rcu(*i, head) {
        struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

        // Skip hooks whose priority is below hook_thresh: only hooks with
        // priority >= hook_thresh are executed.
        if (hook_thresh > elem->priority)
            continue;

        /* Optimization: we don't need to hold module
           reference here, since function can't sleep. --RR */
        // Call the registered hook function.
        verdict = elem->hook(hook, skb, indev, outdev, okfn);
        if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
            if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) {
                NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook);
                continue;
            }
#endif
            // Neither NF_ACCEPT nor NF_REPEAT: hand the verdict back to
            // the caller (NF_DROP/NF_STOLEN/NF_QUEUE).
            if (verdict != NF_REPEAT)
                return verdict;
            // Only NF_REPEAT gets here: step back one element,
            // presumably so that the loop visits this hook again.
            *i = (*i)->prev;
        }
        // On NF_ACCEPT the walk simply continues with the next hook.
    }
    // No hook on the list, or none of them ended the walk early.
    return NF_ACCEPT;
}

/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise. */
int
nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
             struct net_device *indev, struct net_device *outdev,
             int (*okfn)(struct sk_buff *), int hook_thresh)
{
    struct list_head *elem;
    unsigned int verdict;
    int ret = 0;

    /* We may already have this, but read-locks nest anyway */
    rcu_read_lock();

    elem = &nf_hooks[pf][hook];
next_hook:
    // Run the hooks registered on nf_hooks[pf][hook] through nf_iterate(),
    // continuing after the position stored in elem.
    verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
                         outdev, &elem, okfn, hook_thresh);
    if (verdict == NF_ACCEPT || verdict == NF_STOP) {
        // NF_ACCEPT or NF_STOP: return 1, which tells the caller (e.g. the
        // NF_HOOK()/NF_HOOK_COND() helpers) to execute okfn(), as the
        // comment above the function says.
        ret = 1;
    } else if (verdict == NF_DROP) {
        // NF_DROP: free the skb and return -EPERM, so okfn() is not called.
        kfree_skb(skb);
        ret = -EPERM;
    } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
        // NF_QUEUE: hand the packet to nf_queue() for later processing;
        // when nf_queue() returns 0 the walk continues with the next hook.
        if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
                      verdict >> NF_VERDICT_BITS))
            goto next_hook;
    }
    rcu_read_unlock();
    // Any other verdict (e.g. NF_STOLEN) leaves ret at 0, so okfn() is not
    // executed; with NF_STOLEN the hook has taken the packet off its
    // normal path.
    return ret;
}

#iddef CONFIG_NETFILTER
int
nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
             struct net_device *indev, struct net_device *outdev,
             int (*okfn)(struct sk_buff *), int thresh);

/**
 * nf_hook_thresh - call a netfilter hook
 *
 * Returns 1 if the hook has allowed the packet to pass.  The function
 * okfn must be invoked by the caller in this case.  Any other return
 * value indicates the packet has been consumed by the hook.
 *
 * thresh is passed through to nf_hook_slow() unchanged.
 */
static inline int 
nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *indev, struct net_device *outdev,
               int (*okfn)(struct sk_buff *), int thresh)
{
    /* Fast path: with no hook registered for this pf/hook pair, report
     * "pass" without calling nf_hook_slow(). Compiled out when
     * CONFIG_NETFILTER_DEBUG is set, so the slow path always runs then. */
#ifndef CONFIG_NETFILTER_DEBUG
    if (list_empty(&nf_hooks[pf][hook]))
        return 1;
#endif
    return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);
}

/* Same as nf_hook_thresh() with the threshold fixed at INT_MIN; the result
 * of nf_hook_thresh() is returned unchanged. */
static inline int
nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *indev, struct net_device *outdev,
        int (*okfn)(struct sk_buff *))
{
    int verdict;

    verdict = nf_hook_thresh(pf, hook, skb, indev, outdev, okfn, INT_MIN);
    return verdict;
}
                   
/* Activate hook; either okfn or kfree_skb called, unless a hook
   returns NF_STOLEN (in which case, it's up to the hook to deal with
   the consequences).

   Returns -ERRNO if packet dropped.  Zero means queued, stolen or
   accepted.
*/

/* RR:
   > I don't want nf_hook to return anything because people might forget
   > about async and trust the return value to mean "packet was ok".

   AK:
   Just document it clearly, then you can expect some sense from kernel
   coders :)
*/

/* Runs the hooks through nf_hook_thresh() and, only when that returns 1,
 * calls okfn(skb) and returns its result; any other hook result is
 * returned as is. */
static inline int
NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *in, struct net_device *out,
               int (*okfn)(struct sk_buff *), int thresh)
{
    int ret;

    ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh);
    if (ret != 1)
        return ret;

    return okfn(skb);
}

/*
 * Conditional variant of NF_HOOK().
 *
 * When cond is false the hooks are bypassed and okfn(skb) is called
 * directly. Otherwise the hooks run via nf_hook_thresh() with threshold
 * INT_MIN; okfn(skb) is called only when they return 1, and any other
 * hook result is returned to the caller unchanged.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct sk_buff *skb,
             struct net_device *in, struct net_device *out,
             int (*okfn)(struct sk_buff *), bool cond)
{
    int ret;

    /* The assignment needs its own parentheses: without them "==" binds
     * tighter than "=", so ret would receive the result of the comparison
     * (0 or 1) and a hook result such as -EPERM would be lost. */
    if (!cond ||
            ((ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, INT_MIN)) == 1))
        ret = okfn(skb);
    return ret;
}

/* NF_HOOK_THRESH() with the threshold fixed at INT_MIN; its result is
 * returned unchanged. */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    int ret;

    ret = NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);
    return ret;
}

#else /* !CONFIG_NETFILTER */

/* Without CONFIG_NETFILTER there are no hooks to run: both macros expand
 * to a direct call of okfn(skb) and ignore every other argument. */
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb)
/* !CONFIG_NETFILTER stub: calls okfn(skb) directly and returns its result;
 * all other arguments are ignored. */
static inline int 
nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *indev, struct net_device *outdev,
               int (*okfn)(struct sk_buff *), int thresh)
{
    return okfn(skb);
}

/* !CONFIG_NETFILTER stub: always returns 1 and does not call okfn();
 * all arguments are ignored. */
static inline int 
nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *indev, struct net_device *outdev,
        int (*okfn)(struct sk_buff *))
{
    return 1;
}
#endif /*CONFIG_NETFILTER*/

如果沒有defined CONFIG_NETFILTER，NF_HOOK()其實就是直接呼叫okfn了。到這邊對於netfilter的運作就有基本的認識了，有機會hack其他關於netfilter的心得再和大家分享。

Kernel version：2.6.36

2010年4月16日星期五

Linux Kernel（3.2）- procfs之symlink與mkdir

在procfs底下無法直接使用mkdir/ln等指令建立目錄和建立link，不過有提供兩個API讓user達成這兩件事情。

static struct proc_dir_entry *proc_symlink(const char *src,
  struct proc_dir_entry *parent,const char *dest);

static struct proc_dir_entry *proc_mkdir(const char *name,
 struct proc_dir_entry *parent);

看名字就知道proc_symlink()是用來建立link的，src是檔名(basename)，parent是src所在的目錄，dest是要link的對象。
proc_mkdir()就更容易了，要在那個目錄(parent)下建立新的目錄(name)。
下面是範例：

#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>

MODULE_LICENSE("GPL");

/* Name of the directory created directly under /proc. */
static char *bdir = "brook_dir";
module_param(bdir, charp, 0644);
/* The description has to name the parameter itself (bdir); "dir" is not a
 * parameter of this module. */
MODULE_PARM_DESC(bdir, "brook's dir");

/* Name of the symlink created inside that directory. */
static char *bfile = "brook_file";
module_param(bfile, charp, 0644);
MODULE_PARM_DESC(bfile, "brook's file");

static struct proc_dir_entry *ent = NULL;

/*
 * Module entry point: creates the directory bdir under /proc and, inside
 * it, the symlink bfile pointing at "../uptime".
 *
 * Returns 0 on success and -ENOMEM when either entry cannot be created.
 * The directory is removed again when only the symlink fails, so a failed
 * load leaves nothing behind in /proc.
 */
static int __init init_modules(void)
{
    ent = proc_mkdir(bdir, NULL);
    if (!ent) {
        printk("create dir \"%s\" failed\n", bdir);
        return -ENOMEM;
    }

    if (!proc_symlink(bfile, ent, "../uptime")) {
        printk("create symlink \"%s\" failed\n", bfile);
        /* Undo the mkdir: exit_modules() never runs for a module whose
         * init function failed. */
        remove_proc_entry(bdir, NULL);
        ent = NULL;
        return -ENOMEM;
    }

    return 0;
}

/*
 * Module exit point: removes the entry bfile (looked up relative to ent)
 * and then, provided ent is set, the directory bdir from the /proc root.
 */
static void __exit exit_modules(void)
{
    remove_proc_entry(bfile, ent);

    if (!ent)
        return;

    remove_proc_entry(bdir, NULL);
}

module_init(init_modules);
module_exit(exit_modules);

訂閱：文章 (Atom)

Nano雞排

2016年1月3日星期日

Linux Kernel（15.1）- platform_driver_register()之如何调用driver.probe()

2016年1月2日星期六

Linux Kernel（15）- Platform Devices

2011年12月10日星期六

send signal to user-space

2011年11月6日星期日

利用gen_init_cpio建立initrd的script

2011年5月7日星期六

Linux Kernel（14）- Kernel Synchronization

2011年2月27日星期日

Linux Modules（14.1）- Read Copy Update

2011年2月26日星期六

Linux Kernel（8.1）- Notifier機制剖析

2011年1月16日星期日

Linux softirq執行分析(轉)

2011年1月9日星期日

Linux Modules（1.1）module parameters

2010年12月25日星期六

Linux Modules（7.3）- work queue

2010年12月11日星期六

Linux Kernel（13）- syscall

新增system call “brook()”到kernel 2.6.32的步驟(x86)

config automatically switches from 32-bit to 64-bit for x86

2010年11月27日星期六

Linux Kernel（12.1）- netfilter機制之初探

2010年4月16日星期五

Linux Kernel（3.2）- procfs之symlink與mkdir

熱門文章

關於我自己

網誌存檔

搜尋此網誌

標籤

2016年1月3日 星期日

2016年1月2日 星期六

2011年12月10日 星期六

2011年11月6日 星期日

2011年5月7日 星期六

2011年2月27日 星期日

2011年2月26日 星期六

2011年1月16日 星期日

2011年1月9日 星期日

2010年12月25日 星期六

2010年12月11日 星期六

新增system call “brook()”到kernel 2.6.32的步驟(x86)

2010年11月27日 星期六

2010年4月16日 星期五

熱門文章

關於我自己

網誌存檔

搜尋此網誌

標籤

2016年1月3日星期日

2016年1月2日星期六

2011年12月10日星期六

2011年11月6日星期日

2011年5月7日星期六

2011年2月27日星期日

2011年2月26日星期六

2011年1月16日星期日

2011年1月9日星期日

2010年12月25日星期六

2010年12月11日星期六

2010年11月27日星期六

2010年4月16日星期五