Nano雞排: Linux

顯示具有 Linux - kernel 標籤的文章。顯示所有文章

2011年2月26日星期六

Linux Kernel（8.1）- Notifier機制剖析

由Linux Kernel（8）- Notification可以學會運用notifier，而這一篇會概述如何實現，基本上所謂的publish-and-subscribe pattern都是註冊callback function到某個list上去，某事件發生時，再將整個list的callback function執行過一次。

include/linux/notifier.h

/*
 * Declare a struct blocking_notifier_head variable called `name`,
 * initialised with BLOCKING_NOTIFIER_INIT(name).
 */
#define BLOCKING_NOTIFIER_HEAD(name)                \
        struct blocking_notifier_head name =            \
        BLOCKING_NOTIFIER_INIT(name)

/* Head of a blocking notifier chain: a lock plus the list of callbacks. */
struct blocking_notifier_head {
    // Used by the blocking flavour of notifier chains.
    // kernel/notifier.c carries the following remark about it:
    /*
     * Blocking notifier chain routines.  All access to the chain is
     * synchronized by an rwsem.
     */
    struct rw_semaphore rwsem;
    // Head of the linked list of callback entries
    struct notifier_block __rcu *head;
};

// One node of the callback linked list
struct notifier_block {
    // The callback function itself
    int (*notifier_call)(struct notifier_block *, unsigned long, void *);
    // Next node in the chain
    struct notifier_block __rcu *next;
    // Ordering key used when the node is registered on a list:
    // the larger the number, the higher the priority
    int priority;
};

kernel/notifier.c

/**
 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to a blocking notifier chain.
 * Must be called in process context.
 * // (article note) because a semaphore can only be taken in process context
 *
 * Currently always returns zero.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
    int ret;

    /*
     * This code gets used during boot-up, when task switching is
     * not yet working and interrupts must remain disabled.  At
     * such times we must not call down_write().
     */
    if (unlikely(system_state == SYSTEM_BOOTING))
        return notifier_chain_register(&nh->head, n);

    // Take the rwsem as a writer so the list update is serialised
    down_write(&nh->rwsem);

    // This is the helper that actually links the callback into the list
    ret = notifier_chain_register(&nh->head, n);

    up_write(&nh->rwsem);
    return ret;
}


/*
 *  Notifier chain core routines.  The exported routines below
 *  are layered on top of these, with appropriate locking added.
 */

/*
 * Insert @n into the chain whose first link is @nl, keeping the chain
 * ordered by descending priority.  @n is placed in front of the first
 * entry whose priority is strictly lower than its own, so entries of
 * equal priority stay in registration order.  Always returns 0.
 */
static int notifier_chain_register(struct notifier_block **nl,
                struct notifier_block *n)
{
    struct notifier_block **slot;

    // Walk the link slots until the end of the list or until the
    // entry in the slot has a lower priority than the new one.
    for (slot = nl; *slot != NULL; slot = &(*slot)->next) {
        if ((*slot)->priority < n->priority)
            break;
    }

    // Hang the remainder of the list behind the new entry ...
    n->next = *slot;
    // ... then publish the new entry in the slot we stopped at.
    rcu_assign_pointer(*slot, n);
    return 0;
}

/**
 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: Entry to remove from notifier chain
 *
 * Removes a notifier from a blocking notifier chain.
 * Must be called from process context.
 *
 * Returns zero on success or %-ENOENT on failure.
 */
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
    // Structurally the same as blocking_notifier_chain_register():
    // same boot-time shortcut, same write-side locking.

    int ret;

    /*
     * This code gets used during boot-up, when task switching is
     * not yet working and interrupts must remain disabled.  At
     * such times we must not call down_write().
     */
    if (unlikely(system_state == SYSTEM_BOOTING))
        return notifier_chain_unregister(&nh->head, n);

    // Take the rwsem as a writer so the list update is serialised
    down_write(&nh->rwsem);

    // This is the helper that actually unlinks the callback
    ret = notifier_chain_unregister(&nh->head, n);

    up_write(&nh->rwsem);
    return ret;
}

/*
 * Unlink @n from the chain whose first link is @nl.
 * Returns 0 when @n was found and removed, -ENOENT when it is not on
 * the chain.
 */
static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
{
    struct notifier_block **link;

    for (link = nl; *link != NULL; link = &(*link)->next) {
        // Not the entry we are looking for: move on to the next link
        if (*link != n)
            continue;

        // Found it: make the link skip over n
        rcu_assign_pointer(*link, n->next);
        return 0;
    }
    return -ENOENT;
}


/*
 * Convenience wrapper around __blocking_notifier_call_chain(): passes
 * nr_to_call = -1 and nr_calls = NULL, the "don't care" values described
 * in the notifier_call_chain() documentation.
 */
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v)
{
    return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
}


/**
 *  __blocking_notifier_call_chain - Call functions in a blocking notifier chain
 *  @nh: Pointer to head of the blocking notifier chain
 *  @val: Value passed unmodified to notifier function
 *  @v: Pointer passed unmodified to notifier function
 *  @nr_to_call: See comment for notifier_call_chain.
 *  @nr_calls: See comment for notifier_call_chain.
 *
 *  Calls each function in a notifier chain in turn.  The functions
 *  run in a process context, so they are allowed to block.
 *
 *  If the return value of the notifier can be and'ed
 *  with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
 *  will return immediately, with the return value of
 *  the notifier function which halted execution.
 *  Otherwise the return value is the return value
 *  of the last notifier function called.
 *
 *  Returns NOTIFY_DONE without taking the lock when the chain is empty.
 */
int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
            unsigned long val, void *v, int nr_to_call, int *nr_calls)
{
    int ret = NOTIFY_DONE;

    /*
     * We check the head outside the lock, but if this access is
     * racy then it does not matter what the result of the test
     * is, we re-check the list after having taken the lock anyway:
     */
    if (rcu_dereference_raw(nh->head)) {
        down_read(&nh->rwsem);
        // This helper is what actually runs every callback on the list
        ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
                                nr_calls);
        up_read(&nh->rwsem);
    }
    return ret;
}

/**
 *  notifier_call_chain - Informs the registered notifiers about an event.
 *  @nl:        Pointer to head of the blocking notifier chain
 *  @val:       Value passed unmodified to notifier function
 *  @v:     Pointer passed unmodified to notifier function
 *  @nr_to_call:    Number of notifier functions to be called. Don't care
 *                  value of this parameter is -1.
 *  @nr_calls:  Records the number of notifications sent. Don't care
 *              value of this field is NULL.
 *  @returns:   notifier_call_chain returns the value returned by the
 *              last notifier function called.
 */
static int __kprobes notifier_call_chain(struct notifier_block **nl,
            unsigned long val, void *v, int nr_to_call, int *nr_calls)
{
    int ret = NOTIFY_DONE;
    struct notifier_block *nb, *next_nb;

    nb = rcu_dereference_raw(*nl);

    // blocking_notifier_call_chain() passes nr_to_call = -1.  The
    // counter is only ever decremented, so starting from -1 it stays
    // non-zero and the loop effectively stops only when nb is NULL.
    while (nb && nr_to_call) {
        // The successor is read before the callback is invoked rather
        // than afterwards.
        // NOTE(review): presumably so the walk does not have to touch
        // nb again after its callback has run (e.g. if the callback
        // removes nb from the chain) -- confirm against the notifier
        // chain rules in include/linux/notifier.h.
        next_nb = rcu_dereference_raw(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
        if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
            WARN(1, "Invalid notifier called!");
            nb = next_nb;
            continue;
        }
#endif
        // Run the callback
        ret = nb->notifier_call(nb, val, v);

        if (nr_calls)
            (*nr_calls)++;

        // A result carrying the STOP bits ends the walk: the remaining
        // callbacks are not called
        if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
            break;
        nb = next_nb;
        nr_to_call--;
    }
    return ret;
}

這篇文章還不算完成，後面再補充啦~~

2011年1月16日星期日

Linux softirq執行分析(轉)

又是一篇精彩的文章，強力轉貼。

Linux softirq執行分析 

Author:  sinister
Email:   sinister@whitecell.org
Homepage:http://www.whitecell.org 
Date:    2007-01-11

本文對 Linux 內核軟中斷的執行流程進行了分析，並盡可能的結合當前運行環境詳細地寫出我的理解，
但這並不表明我的理解一定正確。這本是論壇裏的一篇帖子，發出來是為了抛磚引玉，如果您在閱讀本文
時發現了我的錯誤，還望得到您的指正。


今天無意中看了眼 2.6 內核的軟中斷實現，發現和以前我看到的大不相同（以前也是走馬觀花，不大仔
細），可以說改動很大。連 softirq 的調用點都不一樣了，以前是三個調用點，今天搜索了一下源代
碼，發現在多出了ksoftirqd 後，softirq 在系統中的調用點僅是在 ISR 返回時和使用了 
local_bh_enable() 函數後被調用了。網卡部分的顯示調用，我覺得應該不算是系統中的調用點。
ksoftirqd 返回去調用 do_softirq() 函數應該也只能算是其中的一個分支，因為其本身從源頭上
來講也還是在 ISR 返回時 irq_exit() 調用的。這樣一來就和前些日子寫的那份筆記
（Windows/Linux/Solaris 軟中斷機制）裏介紹的 Linux 內核部分的軟中斷有出處了，看來以後
討論 Linux kernel 代碼一定要以內核版本為前題，要不非亂了不可。得買本 Linux 方面的書了，
每次上來直接看相關代碼也不是回事，時間也不允許。


//
// Per the article: do_IRQ calls this on its way out, after the hardware
// ISR has run.
//

void irq_exit(void)
{
    account_system_vtime(current);
    trace_hardirq_exit();
    sub_preempt_count(IRQ_EXIT_OFFSET);

        //
        // Softirq processing is entered from here only when BOTH
        // conditions hold: in_interrupt() is false (per the article:
        // no nested hardware interrupt is still being handled) and a
        // softirq is pending (i.e. some hardware interrupt handler
        // raised one).
        //
    if (!in_interrupt() && local_softirq_pending())
                //
                // Per the article this ends up running do_softirq()
                //
        invoke_softirq();
    preempt_enable_no_resched();
}


#ifndef __ARCH_HAS_DO_SOFTIRQ

/*
 * Entry point for softirq processing: with local interrupts disabled,
 * run __do_softirq() if any softirq is pending.  Returns immediately
 * when in_interrupt() is true.
 */
asmlinkage void do_softirq(void)
{
    __u32 pending;
    unsigned long flags;

    //
    // Bail out straight away when in_interrupt() is true.  Per the
    // article that covers both a nested hardware interrupt and a
    // softirq already being processed; this entry check is mainly
    // there for mutual exclusion with ksoftirqd.
    //
    if (in_interrupt())
        return;

    //
    // Run the following with local interrupts disabled
    //
    local_irq_save(flags);

    //
    // Find out whether any softirq is pending
    //
    pending = local_softirq_pending();

    //
    // If so, __do_softirq() does the real work
    //
    if (pending)
        __do_softirq();

    //
    // Restore the previous interrupt state and carry on
    //
    local_irq_restore(flags);
}


//
// Upper bound on the number of passes __do_softirq() makes over the
// pending softirqs before handing the rest to ksoftirqd: 10.
//

#define MAX_SOFTIRQ_RESTART 10

/*
 * Run the handlers of all pending softirqs on this CPU, repeating for at
 * most MAX_SOFTIRQ_RESTART passes, and wake ksoftirqd if work is still
 * pending afterwards.  The inline commentary is translated from the
 * original article; statements about helpers not shown here are the
 * article's analysis.
 */
asmlinkage void __do_softirq(void)
{
    //
    // Softirq descriptor; it holds the callback registered for the
    // softirq.
    //
    struct softirq_action *h;
    __u32 pending;
    int max_restart = MAX_SOFTIRQ_RESTART;
    int cpu;

    //
    // Snapshot of all softirqs currently pending.
    //
    pending = local_softirq_pending();
    account_system_vtime(current);

    //
    // From here on further softirq processing is masked.  Per the
    // article this is what guarantees that only one softirq runs on
    // a given CPU at a time.
    //
    __local_bh_disable((unsigned long)__builtin_return_address(0));
    trace_softirq_enter();

    //
    // On SMP: the CPU we are running on
    //
    cpu = smp_processor_id();
//
// Loop label
//
restart:
    //
    // On every pass, clear the pending bitmask before hardware
    // interrupts are allowed in again.
    //
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);

    //
    // Interrupts are only enabled here.  Up to this point the code
    // ran with interrupts off, so softirq processing can be preempted
    // by a hardware interrupt only from this line onwards, not from
    // the moment the function was entered.
    //
    local_irq_enable();

    //
    // Article's analysis: the code below can be interrupted by a
    // hardware interrupt, but a softirq raised by that ISR cannot run
    // right away.  Interrupts are on, yet __local_bh_disable() above
    // has masked softirqs: it sets a flag that acts as a mutex, and
    // that flag is one of the things in_interrupt() tests in
    // irq_exit() and do_softirq() -- in_interrupt() reports softirq
    // context as well as hardirq context.  So a softirq raised now
    // cannot re-enter this function; it is merely marked pending and
    // is picked up by one of the following passes of this loop (at
    // most MAX_SOFTIRQ_RESTART of them).
    //


    //
    // Start at the beginning of the softirq vector table.
    //
    h = softirq_vec;

    //
    // Walk over all registered softirq handlers.
    // 
    do {
        //
        // A set pending bit means the handler registered for this
        // softirq has work to do.
        //
        if (pending & 1) {
            //
            // Run the callback registered for this softirq.
            //
            h->action(h);
            rcu_bh_qsctr_inc(cpu);
        }
        //
        // Move on to the next vector entry; the loop continues until
        // no pending bit is left.
        //
        h++;

        //
        // pending is a 32-bit mask consumed one bit per iteration,
        // so a single pass handles at most 32 softirq callbacks.
        //
        pending >>= 1; 
    } while (pending);

    //
    // Interrupts off again: the code below cannot be preempted by a
    // hardware interrupt.
    //
    local_irq_disable();

    //
    // While interrupts were enabled, hardware interrupts may have
    // come in any number of times and each may have raised a softirq
    // that could not run then.  Re-read the pending mask so those can
    // be handled after jumping back to restart.
    //
    pending = local_softirq_pending();

    //
    // Article's analysis: softirqs raised during the window above had
    // their pending bit set but could not run, because irq_exit() and
    // do_softirq() cannot get into this function while softirqs are
    // masked.  This is where they get their chance, even though
    // softirqs are still masked.  In the end the softirq mechanism is
    // just a way of calling the functions registered in the softirq
    // vector table at well-defined points.
    //

    //
    // If new softirqs were raised and fewer than 10 passes have been
    // made, jump back to restart and repeat all of the above: clear
    // the pending mask, re-enable interrupts, and so on.  Both
    // conditions must hold for another pass to happen.
    //
    if (pending && --max_restart)
        goto restart;

    //
    // Softirqs still pending after 10 passes suggest the system is
    // going through a load peak.  To even that out the remaining work
    // is handed to the ksoftirqd thread.  Per the article, ksoftirqd
    // is one big loop that calls do_softirq() explicitly whenever
    // local_softirq_pending() reports work, so the thread woken below
    // may well end up back in this function via do_softirq().  That
    // is also why do_softirq() checks in_interrupt() on entry: to
    // prevent re-entry.  See the ksoftirqd() analysis for details.
    //
    if (pending)
               //
               // Per the article this ends up calling wake_up_process()
               // on ksoftirqd.
               // 
        wakeup_softirqd();

    trace_softirq_exit();
    account_system_vtime(current);

    //
    // Only now is softirq processing unmasked again.  Note that this
    // is _local_bh_enable(), not local_bh_enable(); per the article
    // it therefore does not trigger another do_softirq() call.
    // 
    _local_bh_enable();
}


/*
 * Per-CPU softirq thread body.  Sleeps while nothing is pending and
 * otherwise calls do_softirq() in a loop until kthread_should_stop() is
 * true or the CPU it is bound to (__bind_cpu) is reported offline.
 * Returns 0.  The inline commentary is translated from the original
 * article; statements about helpers not shown here are the article's
 * analysis.
 */
static int ksoftirqd(void * __bind_cpu)
{
    //
    // Explicitly set the nice value of this thread to 19.  Per the
    // article, the effective priority still varies with the
    // scheduler's policy.
    //
    set_user_nice(current, 19);

    //
    // Flag this thread PF_NOFREEZE (per the article: it must not be
    // frozen/suspended).
    //
    current->flags |= PF_NOFREEZE;

    //
    // Mark the thread as sleeping interruptibly; per the article,
    // this kind of sleep can still respond to signals.
    // 
    set_current_state(TASK_INTERRUPTIBLE);

    //
    // Main loop: keep going until the thread is asked to stop, each
    // time checking whether softirqs are pending.
    //
    while (!kthread_should_stop()) {
        //
        // No preemption while we examine and process the pending
        // softirqs.
        //
        preempt_disable();

        //
        // First case: nothing is pending at the moment.
        //
        if (!local_softirq_pending()) {
            //
            // Preemption has been off up to here, so turn it back
            // on before voluntarily giving up the CPU.
            //
            preempt_enable_no_resched();

            //
            // Give up the CPU explicitly; the thread goes to sleep
            // and another task is switched in (scheduler details are
            // out of scope here).
            //
            schedule();

            //
            // When the thread is scheduled again, execution resumes
            // at the statement after schedule(), i.e. at the
            // preempt_disable() below.
            //

            //
            // Back on the CPU: disable preemption again for the
            // processing that follows.
            //
            preempt_disable();
        }

        //
        // Mark the thread runnable.  Preemption is disabled at this
        // point, and both paths reach this line: (1) softirqs were
        // already pending on loop entry, or (2) nothing was pending
        // and the thread has just been scheduled back in.
        //
        __set_current_state(TASK_RUNNING);

        //
        // While softirqs are pending, call do_softirq() to process
        // them.  This is one more entry point into do_softirq(): when
        // __do_softirq() still has work left after its 10 passes it
        // wakes this thread, which may call __do_softirq() again from
        // here.  As noted for __do_softirq(), failing to finish in 10
        // passes means the system is busy.  Per the article, on a
        // very busy system this thread and do_softirq() alternate and
        // the thread's CPU usage can be high; cond_resched() below
        // gives other tasks a chance after every round, but the
        // thread may still consume a lot of CPU.
        //
        while (local_softirq_pending()) {
            /* Preempt disable stops cpu going offline.
               If already offline, we'll be on wrong CPU:
               don't process */
            if (cpu_is_offline((long)__bind_cpu))
                //
                // The CPU this thread is bound to is offline: jump
                // to wait_to_die and wait to be stopped.
                // 
                goto wait_to_die;

                // NOTE: despite the indentation, the statements below
                // are NOT part of the if above; only the goto is.

                //
                // Process the pending softirq callbacks.  Per the
                // article, if a softirq is already being processed
                // do_softirq() returns at once (see its
                // in_interrupt() check).
                //
                do_softirq();

                //
                // Allow this thread to be preempted again.
                //
                preempt_enable_no_resched();
                        
                //
                // May call schedule() indirectly, and preemption has
                // just been re-enabled, so after one round of softirq
                // callbacks another task may be switched in.  The
                // article's reading: this keeps the thread from
                // hogging the CPU under overload and keeps other
                // tasks responsive while many softirqs are pending.
                //
                cond_resched();

                //
                // No preemption again for the next round.
                //
                preempt_disable();

                //
                // Still softirqs pending?  Then go round again.
                //
        }

        //
        // Everything processed: allow preemption, go back to the
        // interruptible sleep state and repeat the outer loop.
        //
        preempt_enable();
        set_current_state(TASK_INTERRUPTIBLE);
    }
   
    //
    // Asked to stop: mark the thread runnable and return.
    //
    __set_current_state(TASK_RUNNING);
    return 0;

//
// Wait here until the thread is stopped
//
wait_to_die:

    //
    // Allow this thread to be preempted.
    //
    preempt_enable();
    /* Wait for kthread_stop */

    //
    // Mark the thread as sleeping interruptibly; per the article,
    // this kind of sleep can still respond to signals.
    // 
    set_current_state(TASK_INTERRUPTIBLE);

    //
    // Until the thread is asked to stop, give up the CPU and go back
    // to interruptible sleep; i.e. this loop only ends once the
    // thread is being stopped.
    //
    while (!kthread_should_stop()) {
        schedule();
        set_current_state(TASK_INTERRUPTIBLE);
    }

    //
    // Asked to stop: mark the thread runnable and return.
    //
    __set_current_state(TASK_RUNNING);
    return 0;
}


參考：
linux kernel source 2.6.19.1 /kernel/softirq.c
WSS(Whitecell Security Systems)，一個非營利性民間技術組織，致力於各種系統安全技術的研究。
堅持傳統的hacker精神，追求技術的精純。
WSS 主頁：http://www.whitecell.org/ 
WSS 論壇：http://www.whitecell.org/forums/

2011年1月9日星期日

Linux Modules（1.1）module parameters

Linux Module允許使用者在insmod時帶入相關的parameters，這些parameters必須被宣告成global，並且使用module_param()宣告資料型態與權限，目前支援的資料型態有byte, short, ushort, int, uint, long, ulong, charp, bool等等。也可以使用module_param_array(name, type, num, perm)宣告成陣列。perm(權限)會決定/sys/module/顯示該參數的權限。

#include <linux/init.h>
#include <linux/module.h>

MODULE_LICENSE("GPL");

static unsigned char b_byte = 1;
module_param(b_byte, byte, S_IRUGO|S_IWUSR);

static short int b_short = 2;
module_param(b_short, short, S_IRUGO|S_IWUSR);

static unsigned short int b_ushort = 3;
module_param(b_ushort, ushort, S_IRUGO|S_IWUSR);

static int b_int = 6;
module_param(b_int, int, S_IRUGO|S_IWUSR);

static unsigned int b_uint = 5;
module_param(b_uint, uint, S_IRUGO|S_IWUSR);

static long b_long = 6;
module_param(b_long, long, S_IRUGO|S_IWUSR);

static unsigned long b_ulong = 7;
module_param(b_ulong, ulong, S_IRUGO|S_IWUSR);

static char *b_charp = "brook";
module_param(b_charp, charp, S_IRUGO|S_IWUSR);

static int b_bool = 1;
module_param(b_bool, bool, S_IRUGO|S_IWUSR);

/*
 * Module entry point: print the current value of every parameter, so
 * values passed on the insmod command line can be checked in the kernel
 * log.  Always returns 0.
 */
static int __init init_modules(void)
{
    printk("b_byte: %d\n", b_byte);
    printk("b_short: %d\n", b_short);
    printk("b_ushort: %u\n", b_ushort);
    printk("b_int: %d\n", b_int);
    printk("b_uint: %u\n", b_uint);
    printk("b_long: %ld\n", b_long);
    printk("b_ulong: %lu\n", b_ulong);
    printk("b_charp: %s\n", b_charp);
    printk("b_bool: %d\n", b_bool);

    return 0;
}

/* Module exit point: nothing was set up in init_modules(), so nothing to undo. */
static void __exit exit_modules(void)
{
}

/* Register the entry and exit points of the module. */
module_init(init_modules);
module_exit(exit_modules);

Kernel Version：2.6.35

Linux Device Drivers, 3e
Documentation/printk-formats.txt

2010年12月25日星期六

Linux Modules（7.3）- work queue

Work queue提供一個interface，讓使用者輕易的建立kernel thread並且將work綁在這個kernel thread上面，如下圖[1]所示。

由於work queue是建立一個kernel thread來執行，所以是在process context，不同於tasklet的interrupt context，因此，work queue可以sleep(設定semaphore或者執行block I/O等等)。

Creating Work
透過 DECLARE_WORK(name, void (work_func_t)(struct work_struct *work)); // statically
或者
INIT_WORK(struct work_struct*, void (work_func_t)(struct work_struct *work)); //dynamically
建立work，就是要執行的工作。
有了work還需要將它和work thread結合，您可以透過create_singlethread_workqueue("name")建立一個名為name的single thread(執行work的thread就稱為work thread)，或者create_workqueue("name")建立per cpu的thread。接著就是要將work和work thread做關聯了，透過queue_work(work_thread, work)就可以將work送給work thread執行了。

queue_delayed_work(work_thread, delayed_work, delay)為queue_work()的delay版本。
flush_workqueue(work_thread)會wait直到這個work_thread的work都做完。flush_workqueue()並不會取消任何被delay執行的work，如果要取消delayed的work則需要呼叫cancel_delayed_work(delayed_work)將delayed_work自某個work thread中移除。

最後，要將work_thread摧毀要呼叫destroy_workqueue(work_thread)。

events/n
除了自己建立work thread以外，kernel還建立一個公用的work thread稱為events

kernel/workqueue.c

/*
 * Abridged excerpt (the "…" lines stand for omitted code): the shared
 * workqueue named "events" is created here and kept in keventd_wq.
 */
void __init init_workqueues(void)
{
    …
    keventd_wq = create_workqueue("events");
    …
}

您可以透過schedule_work(&work)將work送給"events"執行，flush_scheduled_work(void)等待"events"中所有的work執行完畢。

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/delay.h>    /* msleep() */

MODULE_LICENSE("GPL");

static void brook_1_routine(struct work_struct *);
static void brook_2_routine(struct work_struct *);
static void brook_3_routine(struct work_struct *);

static struct work_struct *brook_1_work; // for event
static DECLARE_WORK(brook_2_work, brook_2_routine);
static DECLARE_DELAYED_WORK(brook_3_work, brook_3_routine);
static struct workqueue_struct *brook_workqueue;

/*
 * Non-zero tells the self re-arming works to stop re-queueing themselves.
 * Writable by root only: a world-writable parameter would let any user
 * stop the work items.
 */
static int stop_wq;
module_param(stop_wq, int, S_IRUGO | S_IWUSR);

/*
 * Module entry point.
 *
 * Creates the private workqueue "brook_wq", then queues
 *   - brook_1_work (dynamically allocated) on the shared "events" queue,
 *   - brook_2_work and the delayed brook_3_work on "brook_wq".
 *
 * Returns 0 on success or -ENOMEM when the workqueue or the work item
 * cannot be allocated; nothing is left queued or allocated on failure.
 */
static int __init init_modules(void)
{
    // for brook_wq
    brook_workqueue = create_workqueue("brook_wq");
    if (!brook_workqueue)
        return -ENOMEM;

    // for event
    brook_1_work = kzalloc(sizeof(*brook_1_work), GFP_KERNEL);
    if (!brook_1_work) {
        destroy_workqueue(brook_workqueue);
        return -ENOMEM;
    }

    // Must be cleared before anything is queued: the routines test it.
    stop_wq = 0;

    INIT_WORK(brook_1_work, brook_1_routine);
    schedule_work(brook_1_work);

    queue_work(brook_workqueue, &brook_2_work);
    queue_delayed_work(brook_workqueue, &brook_3_work, 0);
    return 0;
}

/*
 * Module exit point.
 *
 * Order matters: first tell the routines to stop re-arming, then cancel
 * each work synchronously (which also waits for a running instance),
 * and only then destroy the workqueue and free the dynamic work item.
 */
static void __exit exit_modules(void)
{
    stop_wq = 1;

    cancel_delayed_work_sync(&brook_3_work);
    cancel_work_sync(&brook_2_work);
    destroy_workqueue(brook_workqueue);

    // brook_1_work lives on the shared "events" queue; make sure it is
    // neither pending nor running before its memory goes away.
    cancel_work_sync(brook_1_work);
    kfree(brook_1_work);
    brook_1_work = NULL;
}

/* One-shot work on the shared "events" queue: report where it runs. */
static void brook_1_routine(struct work_struct *ws)
{
    printk("%s(): on cpu:%d, pname:%s\n",
            __func__, smp_processor_id(), current->comm);
}

/*
 * Work on "brook_wq" that sleeps for 5 seconds and then re-queues
 * itself until stop_wq is set.
 */
static void brook_2_routine(struct work_struct *ws)
{
    printk("%s(): on cpu:%d, pname:%s\n",
            __func__, smp_processor_id(), current->comm);
    // do something to block/sleep
    // the work in the same workqueue is also deferred.
    msleep(5000);
    if (!stop_wq) {
        queue_work(brook_workqueue, &brook_2_work);
    }
}

/*
 * Delayed work on "brook_wq" that re-arms itself with a delay of 50
 * jiffies until stop_wq is set.
 */
static void brook_3_routine(struct work_struct *ws)
{
    printk("%s(): on cpu:%d, pname:%s\n",
            __func__, smp_processor_id(), current->comm);
    if (!stop_wq) {
        queue_delayed_work(brook_workqueue, &brook_3_work, 50);
    }
}

module_init(init_modules);
module_exit(exit_modules);

Kernel Version：2.6.35
參考資料：

http://www.embexperts.com/viewthread.php?tid=12&highlight=work%2Bqueue
Linux Kernel Development 2nd, Novell Press

2010年12月11日星期六

Linux Kernel（13）- syscall

System Call在HW和user space提供一層抽象層，主要目的有：

為user space提供硬體抽象層。比如，讀取檔案時，不用管檔案所在的媒體類型與檔案儲存類型。
System call能確保系統的安全與穩定。避免user space的無意或惡意的破壞。

除了exception和trap以外，System call是user space進入kernel space的唯一管道。
User space的programming主要是base on API(Application Programming Interface)並非system call，從programmer的觀點來看，關注的是API(如C library)而非system call。

System call的return type為long，主要是要相容64bit，return value通常代表失敗或成功，失敗時，error code通常會寫入global variable “errno”。

典型的system call都以sys_開頭，如getpid()的system call為：

/* getpid() system call: returns the tgid field of the calling task. */
asmlinkage long sys_getpid(void)
{
    return current->tgid;
}

在Linux中(x86)，將所有的system call存放在一個system call table中，透過system call number來索引(index)要執行的system call，儘管每個platform所implement的system call table和system call number都不同，但是原理都是相同的，首先會將system call number存放在某個特定的CPU register(X86放在eax)，並將system call的參數也存放至其他的register(最多5個參數，x86依序為ebx、ecx、edx、esi和edi)，接著透過int 0x80進入system call處理程序，透過system call number(eax)在system call table中找到相對應的system call，並且執行該system call，因為參數存放事先就定義好了，所以就可以在registers(x86依序為ebx、ecx、edx、esi和edi)中依序讀出要處理的參數，超過五個參數就會用structure來傳遞，而ioctl這個不定參數的system call是傳遞pointer的方式來存取，ioctl是一個不好的例子，因為定義不是很明確，system call最好是能定義明確。

新增system call “brook()”到kernel 2.6.32的步驟(x86)

新增一筆system call entry到sys_call_table中arch/x86/kernel/syscall_table_32.S。

定義brook的system call number，arch/x86/include/asm/unistd_32.h，並將NR_syscalls做遞增。

/* System call number assigned to brook(). */
#define __NR_brook              337
/* Total number of system calls; must be one past the highest number. */
#define NR_syscalls             338

定義system call的原型，include/linux/syscalls.h。

asmlinkage long sys_brook(int n, unsigned long arg);

加入至system call table中，arch/x86/kernel/syscall_table_32.S。

.long sys_brook;

撰寫system call的內容。

obj-y := brook.o

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

/*
 * brook(n, arg) - sum n integers stored in user space.
 * @n:   number of integers to read; nothing is read when n <= 0
 * @arg: user-space address of the first integer
 *
 * Each value is fetched with get_user() and logged with printk().
 * Returns the sum, or the error returned by get_user() as soon as one
 * of the reads fails.
 *
 * NOTE(review): n comes straight from user space and is not bounded, so
 * the caller controls how long this loop runs and how much is logged.
 */
SYSCALL_DEFINE2(brook, int, n, unsigned long, arg)
{
    int __user *p = (int __user *) arg;
    int i, x, sum = 0, err;

    printk("n=%d, ", n);
    for (i = 0; i < n; i++) {
        err = get_user(x, p + i);
        // Check the result before touching x: a failed read must not
        // contribute to the sum.
        if (err) {
            return err;
        }
        sum += x;
        printk("[%d]=%d, ", i, x);
    }

    return sum;
}

ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ brook_syscall/

撰寫Application測試system call

#include <linux/unistd.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#define __NR_brook 337

/*
 * User-space wrapper for the brook system call.
 *
 * @n:   number of int arguments that follow
 * @...: the n int values to be summed by the kernel
 *
 * The variadic arguments are copied into a contiguous int array, because
 * the kernel side reads an array of ints from the address it is given.
 * Passing the va_list itself only happens to work where va_list is a
 * plain pointer into the stack (e.g. i386) and is not portable.
 *
 * Returns whatever syscall() returns, or -1 if the temporary array
 * cannot be allocated.
 */
int brook(int n, ...)
{
    va_list ap;
    int *values = NULL;
    int i, ret;

    if (n > 0) {
        values = malloc((size_t)n * sizeof *values);
        if (!values)
            return -1;

        va_start(ap, n);
        for (i = 0; i < n; i++)
            values[i] = va_arg(ap, int);
        va_end(ap);
    }

    ret = syscall(__NR_brook, n, values);
    free(values);
    return ret;
}

#include <stdio.h>
#include "brook.h"

/*
 * Test program: ask the brook system call to sum 3, 2 and 1 and print
 * the result.  Exits with status 0; the value printf() returns (a
 * character count) is not a meaningful exit status.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    printf("%d\n", brook(3, 3, 2, 1));
    return 0;
}

Kernel Version：2.6.32
參考資料：

Linux Kernel Development 2nd, Novell Press
http://pradeepkumar.org/2010/01/implementing-a-new-system-call-in-kernel-version-2-6-32.html
Professional Linux Kernel Architecture, Wiley Publishing

config automatically switches from 32-bit to 64-bit for x86

今天我用我的NB去make config，卻發現config會自動的切成64bit的，如果想要編成32bit，就執行linux32 make menuconfig即可。

參考資料：
http://kerneltrap.org/mailarchive/linux-kernel/2010/6/6/4579953/thread

2010年11月27日星期六

Linux Kernel（12.1）- netfilter機制之初探

延續Linux Modules（12）- netfilter我們由nf_register_hooks()來看看netfilter這個framework是如何運作的。

struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;

/*
 * Register the hook @reg on nf_hooks[reg->pf][reg->hooknum].
 *
 * The list is kept sorted by ascending priority: @reg is inserted in
 * front of the first element whose priority is greater than its own
 * (or at the tail when there is none).
 *
 * Returns 0 on success, or the negative value returned by
 * mutex_lock_interruptible() when taking nf_hook_mutex fails.
 */
int nf_register_hook(struct nf_hook_ops *reg)
{
    struct nf_hook_ops *elem;
    int err;

    err = mutex_lock_interruptible(&nf_hook_mutex);
    if (err < 0)
        return err;
    list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
        if (reg->priority < elem->priority)
            break;
    }
    // Insert after elem's predecessor, i.e. directly in front of elem.
    list_add_rcu(&reg->list, elem->list.prev);
    mutex_unlock(&nf_hook_mutex);
    return 0;
}

void nf_unregister_hook(struct nf_hook_ops *reg)
{
    mutex_lock(&nf_hook_mutex);
    list_del_rcu(®->list);
    mutex_unlock(&nf_hook_mutex);
    synchronize_net();
}

nf_register_hook()其實就是在將要註冊的callback function依照所屬的protocol family以及hooknum插入struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]，並且會依照priority由小到大排列，而nf_unregister_hook()就是很簡單的將reg由nf_hooks中移除。

接著我們再來看看nf_iterate()，程式碼中以//為註解方式，且為粗體字型就是我的註解。

/*
 * Run the hooks on @head, continuing after the position stored in *@i,
 * until one of them returns a verdict other than NF_ACCEPT/NF_REPEAT.
 * Returns that verdict, or NF_ACCEPT when the end of the list is
 * reached.  *@i is left at the last element visited.
 */
unsigned int
nf_iterate(struct list_head *head, struct sk_buff *skb,
          unsigned int hook, const struct net_device *indev,
          const struct net_device *outdev, struct list_head **i,
          int (*okfn)(struct sk_buff *), int hook_thresh)
{
    unsigned int verdict;

    /*
     * The caller must not block between calls to this
     * function because of risk of continuing from deleted element.
     */
    list_for_each_continue_rcu(*i, head) {
        struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

        // Hooks whose priority is below hook_thresh are skipped; a
        // hook runs only when its priority is >= hook_thresh
        if (hook_thresh > elem->priority)
            continue;

        /* Optimization: we don't need to hold module
           reference here, since function can't sleep. --RR */
        // Hand the packet to the registered hook function
        verdict = elem->hook(hook, skb, indev, outdev, okfn);
        if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
            if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) {
                NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook);
                continue;
            }
#endif
            // Neither NF_ACCEPT nor NF_REPEAT: return the verdict
            // (NF_DROP/NF_STOLEN/NF_QUEUE)
            if (verdict != NF_REPEAT)
                return verdict;
            // Reaching this line means NF_REPEAT: step back one
            // element so the loop visits the same hook again
            *i = (*i)->prev;
        }
        // On NF_ACCEPT simply carry on with the next hook function
    }
    // Either no hook function ran, or all of them returned NF_ACCEPT
    return NF_ACCEPT;
}

/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise. */
int
nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
             struct net_device *indev, struct net_device *outdev,
             int (*okfn)(struct sk_buff *), int hook_thresh)
{
    struct list_head *elem;
    unsigned int verdict;
    int ret = 0;

    /* We may already have this, but read-locks nest anyway */
    rcu_read_lock();

    elem = &nf_hooks[pf][hook];
next_hook:
    // Run the hooks on the nf_hooks[pf][hook] list through nf_iterate()
    verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
                         outdev, &elem, okfn, hook_thresh);
    if (verdict == NF_ACCEPT || verdict == NF_STOP) {
        // NF_ACCEPT or NF_STOP: return 1, which tells the caller to
        // execute okfn() (see the comment above the function)
        ret = 1;
    } else if (verdict == NF_DROP) {
        // NF_DROP: free the skb and return -EPERM rather than 1, so
        // okfn() is not called
        kfree_skb(skb);
        ret = -EPERM;
    } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
        // NF_QUEUE: hand the packet to nf_queue(); when that returns
        // 0, continue iterating with the next hook
        if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
                      verdict >> NF_VERDICT_BITS))
            goto next_hook;
    }
    rcu_read_unlock();
    // Any other verdict (e.g. NF_STOLEN) ends up here with ret == 0,
    // so okfn() is not executed.  Per the article, NF_STOLEN changes
    // the path the packet would otherwise have taken.
    return ret;
}

#iddef CONFIG_NETFILTER
int
nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
             struct net_device *indev, struct net_device *outdev,
             int (*okfn)(struct sk_buff *), int thresh);

/**
 * nf_hook_thresh - call a netfilter hook
 * 
 * Returns 1 if the hook has allowed the packet to pass.  The function
 * okfn must be invoked by the caller in this case.  Any other return
 * value indicates the packet has been consumed by the hook.
 */
static inline int 
nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *indev, struct net_device *outdev,
               int (*okfn)(struct sk_buff *), int thresh)
{
#ifndef CONFIG_NETFILTER_DEBUG
    if (list_empty(&nf_hooks[pf][hook]))
        return 1;
#endif
    return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);
}

static inline int
nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *indev, struct net_device *outdev,
        int (*okfn)(struct sk_buff *))
{
    return nf_hook_thresh(pf, hook, skb, indev, outdev, okfn, INT_MIN);
}
                   
/* Activate hook; either okfn or kfree_skb called, unless a hook
   returns NF_STOLEN (in which case, it's up to the hook to deal with
   the consequences).

   Returns -ERRNO if packet dropped.  Zero means queued, stolen or
   accepted.
*/

/* RR:
   > I don't want nf_hook to return anything because people might forget
   > about async and trust the return value to mean "packet was ok".

   AK:
   Just document it clearly, then you can expect some sense from kernel
   coders :)
*/

static inline int
NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *in, struct net_device *out,
               int (*okfn)(struct sk_buff *), int thresh)
{
    int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh);
    if (ret == 1)
        ret = okfn(skb);
    return ret;
}

static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct sk_buff *skb,
             struct net_device *in, struct net_device *out,
             int (*okfn)(struct sk_buff *), bool cond)
{
    int ret;

    if (!cond ||
            (ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, INT_MIN) == 1))
        ret = okfn(skb);
    return ret;
}

/* NF_HOOK_THRESH with the threshold fixed at INT_MIN. */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    const int thresh = INT_MIN;

    return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, thresh);
}

#else /* !CONFIG_NETFILTER */

#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb)
/* Netfilter compiled out: there are no hooks, so just run the continuation. */
static inline int
nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *indev, struct net_device *outdev,
               int (*okfn)(struct sk_buff *), int thresh)
{
    const int result = okfn(skb);

    return result;
}

/* Netfilter compiled out: always answer 1, without touching any argument. */
static inline int
nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *indev, struct net_device *outdev,
        int (*okfn)(struct sk_buff *))
{
    enum { NF_HOOK_PASS = 1 };

    return NF_HOOK_PASS;
}
#endif /*CONFIG_NETFILTER*/

如果沒有定義CONFIG_NETFILTER，NF_HOOK()其實就是直接呼叫okfn了。到這邊對於netfilter的運作就有基本的認識了，有機會hack其他關於netfilter的心得再和大家分享。

Kernel version：2.6.36

2010年4月16日星期五

Linux Kernel（3.2）- procfs之symlink與mkdir

在procfs底下無法直接使用mkdir/ln等指令建立目錄和建立link，不過有提供兩個API讓user達成這兩件事情。

static struct proc_dir_entry *proc_symlink(const char *src,
  struct proc_dir_entry *parent,const char *dest);

static struct proc_dir_entry *proc_mkdir(const char *name,
 struct proc_dir_entry *parent);

看名字就知道proc_symlink()是用來建立link的，src是檔名(basename)，parent是src所在的目錄，dest是要link的對象。
proc_mkdir()就更容易了，要在那個目錄(parent)下建立新的目錄(name)。
下面是範例：

#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>

MODULE_LICENSE("GPL");

/* Name of the directory created under /proc; can be set at load time. */
static char *bdir = "brook_dir";
module_param(bdir, charp, 0644);
/* The description must name the parameter itself (bdir, not dir). */
MODULE_PARM_DESC(bdir, "brook's dir");

/* Name of the symlink created inside that directory. */
static char *bfile = "brook_file";
module_param(bfile, charp, 0644);
MODULE_PARM_DESC(bfile, "brook's file");

static struct proc_dir_entry *ent = NULL;

/*
 * Create /proc/<bdir> and, inside it, a symlink <bfile> pointing at
 * "../uptime".
 *
 * Returns 0 on success or -ENOMEM when either entry cannot be created.
 * If the symlink fails, the directory created just before is removed again
 * so that a failed load leaves nothing behind in /proc.
 */
static int __init init_modules(void)
{
    ent = proc_mkdir(bdir, NULL);
    if (!ent) {
        printk("create dir \"%s\" failed\n", bdir);
        return -ENOMEM;
    }

    if (!proc_symlink(bfile, ent, "../uptime")) {
        printk("create symlink \"%s\" failed\n", bfile);
        remove_proc_entry(bdir, NULL);
        ent = NULL;
        return -ENOMEM;
    }

    return 0;
}

/*
 * Remove the symlink first, then the directory that contained it.
 * Both removals are guarded by the same check: without a directory entry
 * there is no parent to remove the symlink from.
 */
static void __exit exit_modules(void)
{
    if (!ent)
        return;

    remove_proc_entry(bfile, ent);
    remove_proc_entry(bdir, NULL);
}

module_init(init_modules);
module_exit(exit_modules);

2010年3月22日星期一

Linux Kernel（12）- netfilter

netfilter是一個packet mangling的framework，主要在protocol stack中提供一些hook point(NF_IP_PRE_ROUTING、NF_IP_LOCAL_IN、NF_IP_FORWARD、NF_IP_POST_ROUTING和NF_IP_LOCAL_OUT)，讓user可以在這些hook point上註冊並且執行一些hook function，根據hook function傳回來的數值來決定是否要丟棄(NF_DROP)、pass(NF_ACCEPT)、或者queue(NF_QUEUE)等等。

NF_ACCEPT︰ continue traversal as normal.
NF_DROP︰ drop the packet; don't continue traversal.
NF_STOLEN︰ I've taken over the packet; don't continue traversal.
NF_QUEUE︰ queue the packet.
NF_REPEAT︰ call this hook again.

/* Describes one hook function and the hook point it is registered on. */
struct nf_hook_ops
{
    /* List linkage; not filled in by the user (see the marker below). */
    struct list_head list;

    /* User fills in from here down. */
    /* The hook function to call. */
    nf_hookfn *hook;
    /* Owning module; the examples in this article set it to THIS_MODULE. */
    struct module *owner;
    /* Protocol family (NFPROTO_*) the hook belongs to. */
    u_int8_t pf;
    /* Number of the hook point within that protocol family. */
    unsigned int hooknum;
    /* Hooks are ordered in ascending priority. */
    int priority;
};

pf是protocol family，目前有NFPROTO_UNSPEC、NFPROTO_IPV4、NFPROTO_ARP、NFPROTO_BRIDGE、NFPROTO_IPV6和NFPROTO_DECNET等等。這些值也等同sock的protocol family。
hooknum則是填入hook point num，netfilter是一個framework，會在很多地方設立hook point，而我們可以使用nf_register_hook()/nf_register_hooks()將我們的hook function掛在這些點上，以IPv4來說(上面的圖示)，就提供了五個hook point，包含：

Hook	Called
NF_IP_PRE_ROUTING	After sanity checks, before routing decisions.
NF_IP_LOCAL_IN	After routing decisions if packet is for this host.
NF_IP_FORWARD	If the packet is destined for another interface.
NF_IP_LOCAL_OUT	For packets coming from local processes on their way out.
NF_IP_POST_ROUTING	Just before outbound packets "hit the wire".

當然有了nf_register_hook()/nf_register_hooks()提供註冊，也會提供nf_unregister_hook()/nf_unregister_hooks()做unregister。
以下就是一個分別在IPv4的五個hook點上，印出saddr和daddr。

#include <linux/init.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

MODULE_LICENSE("GPL");

/*
 * Print the IPv4 source and destination address of @skb, prefixed with @fn
 * (the hook functions in this file pass their own name).
 *
 * Declared static inline: a plain "inline" definition does not provide an
 * external definition in C99, which can leave the symbol unresolved when
 * the compiler decides not to inline a call.
 */
static inline void dumpIpHdr(const char *fn, const struct sk_buff *skb)
{
    const struct iphdr *ip = ip_hdr(skb);

    printk("%s, saddr:%pI4, daddr:%pI4\n", fn, &ip->saddr, &ip->daddr);
}

/* Hook callback: log the packet's addresses, then let it continue. */
static unsigned int
prerouting(unsigned int hook, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff*))
{
    dumpIpHdr(__func__, skb);

    return NF_ACCEPT;
}

/* Hook callback: log the packet's addresses, then let it continue. */
static unsigned int
localin(unsigned int hook, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff*))
{
    dumpIpHdr(__func__, skb);

    return NF_ACCEPT;
}

/* Hook callback: log the packet's addresses, then let it continue. */
static unsigned int
localout(unsigned int hook, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff*))
{
    dumpIpHdr(__func__, skb);

    return NF_ACCEPT;
}

/* Hook callback: log the packet's addresses, then let it continue. */
static unsigned int
postrouting(unsigned int hook, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff*))
{
    dumpIpHdr(__func__, skb);

    return NF_ACCEPT;
}

/* Hook callback: log the packet's addresses, then let it continue. */
static unsigned int
fwding(unsigned int hook, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff*))
{
    dumpIpHdr(__func__, skb);

    return NF_ACCEPT;
}

/*
 * One entry per IPv4 hook point; every entry uses priority NF_IP_PRI_RAW.
 * The array is registered and unregistered as a whole.
 */
static struct nf_hook_ops brook_ops[] __read_mostly = {
    {
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_RAW,
        .hook     = prerouting,
    },
    {
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_RAW,
        .hook     = localin,
    },
    {
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_FORWARD,
        .priority = NF_IP_PRI_RAW,
        .hook     = fwding,
    },
    {
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_RAW,
        .hook     = localout,
    },
    {
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_RAW,
        .hook     = postrouting,
    },
};

/*
 * Register every hook in brook_ops.
 *
 * Returns 0 on success, or the negative value reported by
 * nf_register_hooks(). Propagating the error makes the module load fail,
 * so exit_modules() is never asked to unregister hooks that were not
 * registered.
 */
static int __init init_modules(void)
{
    int err = nf_register_hooks(brook_ops, ARRAY_SIZE(brook_ops));

    if (err < 0) {
        printk("nf_register_hook failed\n");
        return err;
    }
    return 0;
}

/* Unregister every hook that init_modules() registered. */
static void __exit exit_modules(void)
{
    const unsigned int count = ARRAY_SIZE(brook_ops);

    nf_unregister_hooks(brook_ops, count);
}

module_init(init_modules);
module_exit(exit_modules);

以下這張是更為清楚的netfilter packet flow：

2010年1月31日星期日

Linux Kernel（11.1）- sysfs and hotplug

Linux提供兩種非同步的hotplug機制通知user-space裝置狀態的改變，一是usermode helper，另一個則是透過netlink。

usermode helper
每當kernel收到hotplug event便會執行"CONFIG_UEVENT_HELPER_PATH"（預設值是/sbin/hotplug，可以透過修改/proc/sys/kernel/hotplug修改預設值），在embedded system中，常用的是busybox，所以，通常會把UEVENT HELPER換成/sbin/mdev（找時間來寫寫busybox），執行UEVENT HELPER會攜帶一些環境變數，如ACTION/DEVPATH/SUBSYSTEM/HOME/PATH等等，透過這些環境變數，可以取得Device Name，MAJOR/MINOR number等等，我們先來看部份的程式碼，往後有機會再作深度研究。

/*
 * Excerpt of kobject_uevent_env(): only the part that launches the
 * user-space helper is shown; the elided parts are marked in the body.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
         char *envp_ext[])
{
    ... 略...
    /* call uevent_helper, usually only enabled during early boot */
    if (uevent_helper[0]) {
        char *argv [3];

        /* argv: helper path, subsystem name, NULL terminator. */
        argv [0] = uevent_helper;
        argv [1] = (char *)subsystem;
        argv [2] = NULL;
        /* Append HOME and PATH to the event environment; bail out on error. */
        retval = add_uevent_var(env, "HOME=/");
        if (retval)
            goto exit;
        retval = add_uevent_var(env,
                        "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
        if (retval)
            goto exit;

        /*
         * Run the helper with the collected environment.
         * NOTE(review): UMH_WAIT_EXEC presumably waits only until the helper
         * has been exec'ed, not until it exits; confirm against kmod.h.
         */
        retval = call_usermodehelper(argv[0], argv,
                        env->envp, UMH_WAIT_EXEC);
    }
    ... 略...
}

我利用一個shell script來取代UEVENT HELPER，主要目的是要印出有哪些環境變數，和傳遞哪些參數給UEVENT HELPER。

#!/bin/sh
# Append the environment and the command-line arguments this helper was
# started with to /helper.log, followed by a separator line.
{
    env
    echo "CMD:" $@
    echo "----------     end     ----------"
} >> /helper.log

netlink
另外一條路徑就是netlink了，基本上也是傳遞相同的資訊。

/*
 * Excerpt of kobject_uevent_env(): only the part that broadcasts the event
 * over netlink is shown; the elided parts are marked in the body.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
         char *envp_ext[])
{
    ... 略...
#if defined(CONFIG_NET)
    /* send netlink message */
    if (uevent_sock) {
        struct sk_buff *skb;
        size_t len;

        /* allocate message with the maximum possible size */
        /* Header is "<action>@<devpath>": the +2 covers '@' and the NUL. */
        len = strlen(action_string) + strlen(devpath) + 2;
        skb = alloc_skb(len + env->buflen, GFP_KERNEL);
        if (skb) {
            char *scratch;

            /* add header */
            scratch = skb_put(skb, len);
            sprintf(scratch, "%s@%s", action_string, devpath);

            /* copy keys to our continuous event payload buffer */
            /* Each string is copied with its terminating NUL (strlen + 1). */
            for (i = 0; i < env->envp_idx; i++) {
                len = strlen(env->envp[i]) + 1;
                scratch = skb_put(skb, len);
                strcpy(scratch, env->envp[i]);
            }

            NETLINK_CB(skb).dst_group = 1;
            retval = netlink_broadcast(uevent_sock, skb, 0, 1,
                            GFP_KERNEL);
            /* ENOBUFS should be handled in userspace */
            if (retval == -ENOBUFS) {
                retval = 0;
            }
        } else {
            /* No buffer could be allocated for the message. */
            retval = -ENOMEM;
        }
    }
#endif
    ... 略...
}

user-space的code如下：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#include <linux/types.h>
#include <linux/netlink.h>

/*
 * Listen on the NETLINK_KOBJECT_UEVENT socket and print every
 * NUL-separated string of each received message on its own line.
 *
 * Exits with status 1 when the socket cannot be created or bound, or when
 * recv() fails; the reason is reported through perror().
 */
int main(int argc, char *argv[])
{
    struct sockaddr_nl nls;
    struct pollfd pfd;
    char buf[512];

    (void)argc;
    (void)argv;

    memset(&nls, 0, sizeof(nls));
    nls.nl_family = AF_NETLINK;
    nls.nl_pid = getpid();
    nls.nl_groups = -1;     /* all bits set in the group mask */

    pfd.events = POLLIN;
    pfd.fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
    if (pfd.fd == -1) {
        /* Report the real cause instead of assuming a permission problem. */
        perror("socket");
        exit(1);
    }

    if (bind(pfd.fd, (struct sockaddr *)&nls, sizeof(nls))) {
        perror("bind");
        close(pfd.fd);
        exit(1);
    }

    while (-1 != poll(&pfd, 1, -1)) {
        /*
         * Keep one byte free so the data can always be NUL-terminated:
         * the loop below walks the buffer with "%s" and strlen(), which
         * would read past the end of buf if a message filled it completely.
         */
        ssize_t len = recv(pfd.fd, buf, sizeof(buf) - 1, MSG_DONTWAIT);
        ssize_t i = 0;

        if (len == -1) {
            perror("recv");
            close(pfd.fd);
            exit(1);
        }
        buf[len] = '\0';

        while (i < len) {
            printf("%s\n", buf + i);
            i += (ssize_t)strlen(buf + i) + 1;
        }
    }
    printf("\n");
    close(pfd.fd);
    return 0;
}

Linux Kernel（11）- sysfs and device node

在linux kernel 2.6.x提供了sysfs，經由這樣的file-system可以告訴user-space系統有哪些裝置，而user-space的程式就可以動態的在/dev底下產生相對應的device node。

device node：
在/sys底下有一些名為"dev"的檔案，就是包含該裝置的major/minor number，比如：

# cat /sys/class/mem/zero/dev
1:5
# ls -al /dev/zero
crw-rw-rw- 1 root root 1, 5 2010-02-01 21:58 /dev/zero

所有的block device都可以在/sys/block/*/dev和/sys/block/*/*/dev找到。
所有的char device都可以在/sys/bus/*/devices/*/dev和/sys/class/*/*/dev找到。

所以一個簡易的動態device node產生程式就可以用script撰寫如下：

#!/bin/sh
# Create /dev nodes from the "dev" attribute files exported by sysfs.
# Each such file contains "MAJOR:MINOR"; the node is named after the
# directory that holds the file.

# make_nodes TYPE FILE...
#   TYPE is the mknod device type (b or c); each FILE is a sysfs "dev" file.
make_nodes() {
    type=$1
    shift
    for i in "$@"
    do
        # A glob that matched nothing is passed through literally; skip it.
        [ -f "$i" ] || continue

        IFS=: read -r MAJOR MINOR < "$i"
        # Use the parent directory name. Stripping the first "/dev" with sed
        # would also hit the "/devices/" component of /sys/bus paths.
        DEVNAME=$(basename "$(dirname "$i")")
        mknod "/dev/$DEVNAME" "$type" "$MAJOR" "$MINOR"
    done
}

# Block Device
make_nodes b /sys/block/*/dev /sys/block/*/*/dev

# Char Device
make_nodes c /sys/bus/*/devices/*/dev /sys/class/*/*/dev

2010年1月27日星期三

Linux Kernel（10.3）- Command line partition table parsing

MTD Partition除了在code中寫死以外，其實還可以透過一些parsers來作規劃，這一章就要來教大家如何使用"Command line partition table parsing"。首先必須在kernel中啟用"Command line partition table parsing"，請參照下圖。

這樣kernel就可以支援"Command line partition table parsing"，然後我們還是拿mtdram.c的code來改（紅色的部份）。

/*
 * mtdram - a test mtd device
 * Author: Alexander Larsson <alex@cendio.se>
 *
 * Copyright (c) 1999 Alexander Larsson <alex@cendio.se>
 * Copyright (c) 2005 Joern Engel <joern@wh.fh-wedel.de>
 *
 * This code is GPL
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/ioport.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/mtd/compatmac.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/mtdram.h>
#include <linux/mtd/partitions.h>

/* Device and erase-block size in KiB; defaults come from the kernel config. */
static unsigned long total_size = CONFIG_MTDRAM_TOTAL_SIZE;
static unsigned long erase_size = CONFIG_MTDRAM_ERASE_SIZE;
/* The same two values multiplied by 1024, i.e. in bytes. */
#define MTDRAM_TOTAL_SIZE (total_size * 1024)
#define MTDRAM_ERASE_SIZE (erase_size * 1024)

/* Both sizes are exposed as module parameters when built as a module. */
#ifdef MODULE
module_param(total_size, ulong, 0);
MODULE_PARM_DESC(total_size, "Total device size in KiB");
module_param(erase_size, ulong, 0);
MODULE_PARM_DESC(erase_size, "Device erase block size in KiB");
#endif

// We could store these in the mtd structure, but we only support 1 device..
static struct mtd_info *mtd_info;
/* Set to 1 by mtdram_init_device() when partitions were parsed and used;
 * cleanup_mtdram() reads it to pick the matching removal call. */
static char partitioned = 0;

/*
 * Erase callback: fill the requested range of the backing memory with 0xff
 * and report completion. Returns -EINVAL when the range runs past the end
 * of the device, 0 otherwise.
 */
static int ram_erase(struct mtd_info *mtd, struct erase_info *instr)
{
    char *flash = (char *)mtd->priv;

    if (instr->addr + instr->len > mtd->size)
        return -EINVAL;

    memset(flash + instr->addr, 0xff, instr->len);

    /* The erase completes synchronously: mark it done, then notify. */
    instr->state = MTD_ERASE_DONE;
    mtd_erase_callback(instr);
    return 0;
}

/*
 * Point callback: hand out a direct pointer into the backing memory.
 * Fails with -EINVAL when the range is out of bounds or when the caller
 * asks for a physical address (phys != NULL).
 */
static int ram_point(struct mtd_info *mtd, loff_t from, size_t len,
        size_t *retlen, void **virt, resource_size_t *phys)
{
    /* can we return a physical address with this driver? */
    if (from + len > mtd->size || phys)
        return -EINVAL;

    *virt = mtd->priv + from;
    *retlen = len;

    return 0;
}

/* Counterpart of ram_point(); intentionally does nothing. */
static void ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len)
{
}

/*
 * Allow NOMMU mmap() to directly map the device (if not NULL)
 * - return the address to which the offset maps
 * - return -ENOSYS to indicate refusal to do the mapping
 *
 * This implementation always returns the address of @offset inside the
 * backing memory; @len and @flags are not used.
 */
static unsigned long ram_get_unmapped_area(struct mtd_info *mtd,
                       unsigned long len,
                       unsigned long offset,
                       unsigned long flags)
{
    const unsigned long base = (unsigned long) mtd->priv;

    return base + offset;
}

/*
 * Read callback: copy @len bytes starting at offset @from out of the
 * backing memory into @buf. Returns -EINVAL for an out-of-range request,
 * otherwise 0 with *retlen set to @len.
 */
static int ram_read(struct mtd_info *mtd, loff_t from, size_t len,
        size_t *retlen, u_char *buf)
{
    const void *src;

    if (from + len > mtd->size)
        return -EINVAL;

    src = mtd->priv + from;
    memcpy(buf, src, len);
    *retlen = len;

    return 0;
}

/*
 * Write callback: copy @len bytes from @buf into the backing memory at
 * offset @to. Returns -EINVAL for an out-of-range request, otherwise 0
 * with *retlen set to @len.
 */
static int ram_write(struct mtd_info *mtd, loff_t to, size_t len,
        size_t *retlen, const u_char *buf)
{
    void *dst;

    if (to + len > mtd->size)
        return -EINVAL;

    dst = (char *)mtd->priv + to;
    memcpy(dst, buf, len);
    *retlen = len;

    return 0;
}

/*
 * Module exit: remove the device the same way it was registered
 * (as partitions or as one whole device), then free the backing memory
 * and the mtd_info structure.
 */
static void __exit cleanup_mtdram(void)
{
    if (!mtd_info)
        return;

    if (mtd_has_partitions() && partitioned)
        del_mtd_partitions(mtd_info);
    else
        del_mtd_device(mtd_info);

    vfree(mtd_info->priv);
    kfree(mtd_info);
}

/*
 * Fill in @mtd for a RAM-backed device of @size bytes at @mapped_address
 * and register it.
 *
 * When partition support is available and the "cmdlinepart" parser yields
 * at least one partition, the device is registered through
 * add_mtd_partitions() and its return value is passed back. Otherwise the
 * whole device is registered with add_mtd_device(); a failure there is
 * reported as -EIO. Returns 0 on success.
 */
int mtdram_init_device(struct mtd_info *mtd, void *mapped_address,
        unsigned long size, char *name)
{
    static const char *probes[] = { "cmdlinepart", NULL };
    struct mtd_partition *mtd_parts = NULL;
    int nb_parts = 0;

    memset(mtd, 0, sizeof(*mtd));

    /* Identity and geometry */
    mtd->name = name;
    mtd->type = MTD_RAM;
    mtd->flags = MTD_CAP_RAM;
    mtd->size = size;
    mtd->writesize = 1;
    mtd->erasesize = MTDRAM_ERASE_SIZE;
    mtd->priv = mapped_address;

    /* Operations */
    mtd->owner = THIS_MODULE;
    mtd->erase = ram_erase;
    mtd->point = ram_point;
    mtd->unpoint = ram_unpoint;
    mtd->get_unmapped_area = ram_get_unmapped_area;
    mtd->read = ram_read;
    mtd->write = ram_write;

    if (!mtd_has_partitions())
        goto whole_device;

    printk("has partitions\n");
    if (mtd_has_cmdlinepart()) {
        printk("has probs\n");
        nb_parts = parse_mtd_partitions(mtd, probes, &mtd_parts, 0);
    }
    if (nb_parts <= 0)
        goto whole_device;

    printk("partitioned\n");
    partitioned = 1;
    return add_mtd_partitions(mtd, mtd_parts, nb_parts);

whole_device:
    return add_mtd_device(mtd) ? -EIO : 0;
}

/*
 * Module init: allocate the mtd_info structure and the backing memory,
 * register the device as "brook_flash", then fill the memory with 0xff.
 * Returns 0 on success, -EINVAL for a zero total_size, -ENOMEM when an
 * allocation fails, or the error from mtdram_init_device().
 */
static int __init init_mtdram(void)
{
    void *addr;
    int err;

    if (!total_size)
        return -EINVAL;

    /* Allocate some memory */
    mtd_info = kmalloc(sizeof(struct mtd_info), GFP_KERNEL);
    if (!mtd_info)
        return -ENOMEM;

    addr = vmalloc(MTDRAM_TOTAL_SIZE);
    if (!addr) {
        err = -ENOMEM;
        goto free_info;
    }

    err = mtdram_init_device(mtd_info,
                  addr, MTDRAM_TOTAL_SIZE, "brook_flash");
    if (err)
        goto free_mem;

    memset(mtd_info->priv, 0xff, MTDRAM_TOTAL_SIZE);
    return err;

free_mem:
    vfree(addr);
free_info:
    kfree(mtd_info);
    mtd_info = NULL;
    return err;
}

module_init(init_mtdram);
module_exit(cleanup_mtdram);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Larsson <alexl@redhat.com>");
MODULE_DESCRIPTION("Simulated MTD driver for testing");

Linux Kernel（10.2）- mtd partitions

在drivers/mtd/devices/mtdram.c中可以知道，透過add_mtd_device()/del_mtd_device()新增和刪除mtd device，但是並沒有規劃partition，在這邊將介紹add_mtd_partitions()/del_mtd_partitions()讓您可以透過這兩個函數為mtd切割partition。
此範例為drivers/mtd/devices/mtdram.c內容來修改而成（紅色粗體為修改部份）。

/*
 * mtdram - a test mtd device
 * Author: Alexander Larsson <alex@cendio.se>
 *
 * Copyright (c) 1999 Alexander Larsson <alex@cendio.se>
 * Copyright (c) 2005 Joern Engel <joern@wh.fh-wedel.de>
 *
 * This code is GPL
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/ioport.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/mtd/compatmac.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/mtdram.h>
#include <linux/mtd/partitions.h>

/* Device and erase-block size; defaults come from the kernel config. */
static unsigned long total_size = CONFIG_MTDRAM_TOTAL_SIZE;
static unsigned long erase_size = CONFIG_MTDRAM_ERASE_SIZE;
/* The same two values multiplied by 1024. */
#define MTDRAM_TOTAL_SIZE (total_size * 1024)
#define MTDRAM_ERASE_SIZE (erase_size * 1024)

/*
 * Static partition layout passed to add_mtd_partitions().
 * (The original listing ended the last entry with a full-width comma,
 * which is not valid C.)
 */
static struct mtd_partition brook_partitions[] = {
    {
        .name = "part-1",
        .size = 0x00100000,
        .offset = 0x0000000
    }, {
        /*
         * NOTE(review): mask_flags presumably lists flags removed from the
         * partition, which would make part-2 read-only; confirm against
         * linux/mtd/partitions.h.
         */
        .name = "part-2",
        .size = 0x00100000,
        .offset = MTDPART_OFS_APPEND,
        .mask_flags = MTD_WRITEABLE
    }, {
        /*
         * No .size given, so it is zero-initialised. Presumably 0 means
         * "the rest of the device"; confirm against partitions.h.
         */
        .name = "part-3",
        .offset = MTDPART_OFS_APPEND,
    },
};

/* Both sizes are exposed as module parameters when built as a module. */
#ifdef MODULE
module_param(total_size, ulong, 0);
MODULE_PARM_DESC(total_size, "Total device size in KiB");
module_param(erase_size, ulong, 0);
MODULE_PARM_DESC(erase_size, "Device erase block size in KiB");
#endif

// We could store these in the mtd structure, but we only support 1 device.
static struct mtd_info *mtd_info;

/*
 * Erase callback: fill the requested range of the backing memory with 0xff
 * and report completion. Returns -EINVAL when the range runs past the end
 * of the device, 0 otherwise.
 */
static int ram_erase(struct mtd_info *mtd, struct erase_info *instr)
{
    char *flash = (char *)mtd->priv;

    if (instr->addr + instr->len > mtd->size)
        return -EINVAL;

    memset(flash + instr->addr, 0xff, instr->len);

    /* The erase completes synchronously: mark it done, then notify. */
    instr->state = MTD_ERASE_DONE;
    mtd_erase_callback(instr);
    return 0;
}

/*
 * Point callback: hand out a direct pointer into the backing memory.
 * Fails with -EINVAL when the range is out of bounds or when the caller
 * asks for a physical address (phys != NULL).
 *
 * (The original listing had a stray full-width period after the *retlen
 * assignment, which does not compile.)
 */
static int ram_point(struct mtd_info *mtd, loff_t from, size_t len,
        size_t *retlen, void **virt, resource_size_t *phys)
{
    if (from + len > mtd->size)
        return -EINVAL;

    /* can we return a physical address with this driver? */
    if (phys)
        return -EINVAL;

    *virt = mtd->priv + from;
    *retlen = len;
    return 0;
}

/* Counterpart of ram_point(); intentionally does nothing. */
static void ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len)
{
}

/*
 * Allow NOMMU mmap() to directly map the device (if not NULL)
 * - return the address to which the offset maps
 * - return -ENOSYS to indicate refusal to do the mapping
 *
 * This implementation always returns the address of @offset inside the
 * backing memory; @len and @flags are not used.
 */
static unsigned long ram_get_unmapped_area(struct mtd_info *mtd,
                       unsigned long len,
                       unsigned long offset,
                       unsigned long flags)
{
    const unsigned long base = (unsigned long) mtd->priv;

    return base + offset;
}

/*
 * Read callback: copy @len bytes starting at offset @from out of the
 * backing memory into @buf. Returns -EINVAL for an out-of-range request,
 * otherwise 0 with *retlen set to @len.
 */
static int ram_read(struct mtd_info *mtd, loff_t from, size_t len,
        size_t *retlen, u_char *buf)
{
    const void *src;

    if (from + len > mtd->size)
        return -EINVAL;

    src = mtd->priv + from;
    memcpy(buf, src, len);
    *retlen = len;

    return 0;
}

/*
 * Write callback: copy @len bytes from @buf into the backing memory at
 * offset @to. Returns -EINVAL for an out-of-range request, otherwise 0
 * with *retlen set to @len.
 */
static int ram_write(struct mtd_info *mtd, loff_t to, size_t len,
        size_t *retlen, const u_char *buf)
{
    void *dst;

    if (to + len > mtd->size)
        return -EINVAL;

    dst = (char *)mtd->priv + to;
    memcpy(dst, buf, len);
    *retlen = len;

    return 0;
}

/*
 * Module exit: remove the partitions, then free the backing memory and
 * the mtd_info structure. Does nothing when no device was set up.
 */
static void __exit cleanup_mtdram(void)
{
    if (!mtd_info)
        return;

    del_mtd_partitions(mtd_info);
    vfree(mtd_info->priv);
    kfree(mtd_info);
}

/*
 * Fill in @mtd for a RAM-backed device of @size bytes at @mapped_address
 * and register it split into the partitions listed in brook_partitions.
 * Returns 0 on success, -EIO when add_mtd_partitions() fails.
 */
int mtdram_init_device(struct mtd_info *mtd, void *mapped_address,
        unsigned long size, char *name)
{
    memset(mtd, 0, sizeof(*mtd));

    /* Identity and geometry */
    mtd->name = name;
    mtd->type = MTD_RAM;
    mtd->flags = MTD_CAP_RAM;
    mtd->size = size;
    mtd->writesize = 1;
    mtd->erasesize = MTDRAM_ERASE_SIZE;
    mtd->priv = mapped_address;

    /* Operations */
    mtd->owner = THIS_MODULE;
    mtd->erase = ram_erase;
    mtd->point = ram_point;
    mtd->unpoint = ram_unpoint;
    mtd->get_unmapped_area = ram_get_unmapped_area;
    mtd->read = ram_read;
    mtd->write = ram_write;

    return add_mtd_partitions(mtd, brook_partitions,
                ARRAY_SIZE(brook_partitions)) ? -EIO : 0;
}

/*
 * Module init: allocate the mtd_info structure and the backing memory,
 * register the device as "brook_flash", then fill the memory with 0xff.
 * Returns 0 on success, -EINVAL for a zero total_size, -ENOMEM when an
 * allocation fails, or the error from mtdram_init_device().
 */
static int __init init_mtdram(void)
{
    void *addr;
    int err;

    if (!total_size)
        return -EINVAL;

    /* Allocate some memory */
    mtd_info = kmalloc(sizeof(struct mtd_info), GFP_KERNEL);
    if (!mtd_info)
        return -ENOMEM;

    addr = vmalloc(MTDRAM_TOTAL_SIZE);
    if (!addr) {
        err = -ENOMEM;
        goto free_info;
    }

    err = mtdram_init_device(mtd_info, addr,
                  MTDRAM_TOTAL_SIZE, "brook_flash");
    if (err)
        goto free_mem;

    memset(mtd_info->priv, 0xff, MTDRAM_TOTAL_SIZE);
    return err;

free_mem:
    vfree(addr);
free_info:
    kfree(mtd_info);
    mtd_info = NULL;
    return err;
}

/* Entry and exit points of the module. */
module_init(init_mtdram);
module_exit(cleanup_mtdram);

/* Module metadata; the author's address is enclosed in "<...>". */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Larsson <alexl@redhat.com>");
MODULE_DESCRIPTION("Simulated MTD driver for testing");

如果要切割partitions就必須要提供partitions的資訊，於是我們就宣告了一個名為brook_partitions的static struct mtd_partition，然後將原本呼叫add_mtd_device()/del_mtd_device()分別改成add_mtd_partitions()/del_mtd_partitions()就大功告成了。

2010年1月18日星期一

Linux Kernel（10.1）- drivers/mtd/devices/mtdram.c

MTD的基本介紹可以參考MTD - Memory Technology Devices，這篇文章要透過drivers/mtd/devices/mtdram.c來了解mtd的driver如何運作。

/* Device size in KiB; the default comes from the kernel configuration. */
static unsigned long total_size = CONFIG_MTDRAM_TOTAL_SIZE;
/* The same value multiplied by 1024, i.e. in bytes. */
#define MTDRAM_TOTAL_SIZE (total_size * 1024)

/* The size is exposed as a module parameter when built as a module. */
#ifdef MODULE
module_param(total_size, ulong, 0);
MODULE_PARM_DESC(total_size, "Total device size in KiB");
#endif

/*
 * Module init: allocate the mtd_info structure and the backing memory,
 * register the device, then fill the memory with 0xff.
 * Returns 0 on success, -EINVAL for a zero total_size, -ENOMEM when an
 * allocation fails, or the error from mtdram_init_device().
 */
static int __init init_mtdram(void)
{
    void *addr;
    int err;

    if (!total_size)
        return -EINVAL;

    /* Allocate some memory */
    mtd_info = kmalloc(sizeof(struct mtd_info), GFP_KERNEL);
    if (!mtd_info)
        return -ENOMEM;

    addr = vmalloc(MTDRAM_TOTAL_SIZE);
    if (!addr) {
        err = -ENOMEM;
        goto free_info;
    }

    err = mtdram_init_device(mtd_info, addr,
                   MTDRAM_TOTAL_SIZE, "mtdram test device");
    if (err)
        goto free_mem;

    memset(mtd_info->priv, 0xff, MTDRAM_TOTAL_SIZE);
    return err;

free_mem:
    vfree(addr);
free_info:
    kfree(mtd_info);
    mtd_info = NULL;
    return err;
}

首先，看到init_mtdram()，在該function中，我們分配一塊記憶體給mtd_info，以及一塊v-memory給稍後模擬的flash用，接著就呼叫mtdram_init_device()進行mtd的註冊動作（一個mtd partition需要一個mtd_info來存放所需的資訊），init_mtdram()後面就將mtd_info->priv（即v-memory）的內容全部設成0xff，這是因為一個空的flash裡面預設就是0xff。

/*
 * Fill in @mtd for a RAM-backed device of @size bytes at @mapped_address
 * and register it with add_mtd_device().
 * Returns 0 on success, -EIO when the registration fails.
 */
int mtdram_init_device(struct mtd_info *mtd, void *mapped_address,
  unsigned long size, char *name)
{
    memset(mtd, 0, sizeof(*mtd));

    /* Identity and geometry */
    mtd->name = name;
    mtd->type = MTD_RAM;
    mtd->flags = MTD_CAP_RAM;
    mtd->size = size;
    mtd->writesize = 1;
    mtd->erasesize = MTDRAM_ERASE_SIZE;
    mtd->priv = mapped_address;

    /* Operations */
    mtd->owner = THIS_MODULE;
    mtd->erase = ram_erase;
    mtd->point = ram_point;
    mtd->unpoint = ram_unpoint;
    mtd->get_unmapped_area = ram_get_unmapped_area;
    mtd->read = ram_read;
    mtd->write = ram_write;

    return add_mtd_device(mtd) ? -EIO : 0;
}

在mtdram_init_device()主要是填mtd_info相關資訊，然後呼叫add_mtd_device()進行註冊mtd的動作。呼叫del_mtd_device()進行移除mtd的工作。
point()/unpoint()可以參考http://www.linux-mtd.infradead.org/faq/general.html#L_point。

/* Both sizes are exposed as module parameters when built as a module. */
#ifdef MODULE
module_param(total_size, ulong, 0);
MODULE_PARM_DESC(total_size, "Total device size in KiB");
module_param(erase_size, ulong, 0);
MODULE_PARM_DESC(erase_size, "Device erase block size in KiB");
#endif

// We could store these in the mtd structure, but we only support 1 device.
static struct mtd_info *mtd_info;

/*
 * Erase callback: fill the requested range of the backing memory with 0xff
 * and report completion. Returns -EINVAL when the range runs past the end
 * of the device, 0 otherwise.
 */
static int ram_erase(struct mtd_info *mtd, struct erase_info *instr)
{
    char *flash = (char *)mtd->priv;

    if (instr->addr + instr->len > mtd->size)
        return -EINVAL;

    memset(flash + instr->addr, 0xff, instr->len);

    /* The erase completes synchronously: mark it done, then notify. */
    instr->state = MTD_ERASE_DONE;
    mtd_erase_callback(instr);
    return 0;
}

/*
 * Point callback: hand out a direct pointer into the backing memory.
 * Fails with -EINVAL when the range is out of bounds or when the caller
 * asks for a physical address (phys != NULL).
 */
static int ram_point(struct mtd_info *mtd, loff_t from, size_t len,
  size_t *retlen, void **virt, resource_size_t *phys)
{
    /* can we return a physical address with this driver? */
    if (from + len > mtd->size || phys)
        return -EINVAL;

    *virt = mtd->priv + from;
    *retlen = len;

    return 0;
}

/* Counterpart of ram_point(); intentionally does nothing. */
static void ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len)
{
}

/*
 * Allow NOMMU mmap() to directly map the device (if not NULL)
 * - return the address to which the offset maps
 * - return -ENOSYS to indicate refusal to do the mapping
 *
 * This implementation always returns the address of @offset inside the
 * backing memory; @len and @flags are not used.
 */
static unsigned long ram_get_unmapped_area(struct mtd_info *mtd,
        unsigned long len,
        unsigned long offset,
        unsigned long flags)
{
    const unsigned long base = (unsigned long) mtd->priv;

    return base + offset;
}

/*
 * Read callback: copy @len bytes starting at offset @from out of the
 * backing memory into @buf. Returns -EINVAL for an out-of-range request,
 * otherwise 0 with *retlen set to @len.
 */
static int ram_read(struct mtd_info *mtd, loff_t from, size_t len,
  size_t *retlen, u_char *buf)
{
    const void *src;

    if (from + len > mtd->size)
        return -EINVAL;

    src = mtd->priv + from;
    memcpy(buf, src, len);
    *retlen = len;

    return 0;
}

/*
 * Write callback: copy @len bytes from @buf into the backing memory at
 * offset @to. Returns -EINVAL for an out-of-range request, otherwise 0
 * with *retlen set to @len.
 */
static int ram_write(struct mtd_info *mtd, loff_t to, size_t len,
  size_t *retlen, const u_char *buf)
{
    void *dst;

    if (to + len > mtd->size)
        return -EINVAL;

    dst = (char *)mtd->priv + to;
    memcpy(dst, buf, len);
    *retlen = len;

    return 0;
}

剩下的read()/write()/erase()都是copy from/to memory和清成0xff，所以您可以發現讀寫NOR flash和讀寫memory差不多。

2010年1月16日星期六

Linux Kernel（10）- MTD - Memory Technology Devices

說到MTD您就不得不親自拜訪一下MTD的官網(http://www.linux-mtd.infradead.org/)，傳統上UNIX將device分成兩大類，char device和block device，char device就像鍵盤，可以讀資料，但卻不能做seek，也沒有固定大小，而block就像硬碟一樣，可以隨機存取某個位置(seek)。而MTD並不是char device也不是block device，因此建立了新的device類別，稱為MTD。
MTD subsystem提供一個抽象層（FTL）來存取flash device（如NAND、OneNAND、NOR等等），而我們一般用的USB flash因為有IC控制，以Linux的角度看起來就像block device，而不是一個原生的(raw) flash。

一般PC都不會接這些raw flash，不過我們可以透過一些simulate來練習這些device。

在insmod mtd.ko之後我可以透過/proc/mtd得知目前有哪些MTD，因為我們系統當然沒有MTD的device，所以可以insmod mtdram.ko安裝一個虛擬的MTD。

如果要能mount raw flash，還必須透過block device的介面存取，所以再安裝一下mtdblock.ko吧。

利用mkfs.jffs2建立一個jffs2的image，再利用flashcp將image燒錄到flash中，最後就可以mount來用啦。

這一張圖是利用dd將flash的資料備份下來，再利用flashcp還原資料。

2010年1月5日星期二

Linux Kernel（9）- Kthread

在kernel中建立thread可以使用kthread_create()建立一個task，然後再調用wake_up_process(task)讓task真正的運行，如果要kill一個kthread可以使用kthread_stop()。
在kernel中，將kthread_create()和wake_up_process()包裝成kthread_run()，也就是調用了kthread_run()之後，該thread會立刻被執行。

#include <linux/init.h>
#include <linux/module.h>
#include <linux/kthread.h>

MODULE_LICENSE("GPL");

static struct task_struct *brook_tsk;
static int data;
static int kbrook(void *arg);

static int kbrook(void *arg)
{
    unsigned int timeout;
    int *d = (int *) arg;

    for(;;) {
        if (kthread_should_stop()) break;
        printk("%s(): %d\n", __FUNCTION__, (*d)++);
        do {
            set_current_state(TASK_INTERRUPTIBLE);
            timeout = schedule_timeout(10 * HZ);
        } while(timeout);
    }
    printk("break\n");

    return 0;
}

/*
 * Create the "brook" thread running kbrook() on &data and start it.
 * Returns 0 on success, or the error encoded in the pointer returned by
 * kthread_create(); brook_tsk is left NULL in that case.
 */
static int __init init_modules(void)
{
    struct task_struct *tsk = kthread_create(kbrook, &data, "brook");

    if (IS_ERR(tsk)) {
        brook_tsk = NULL;
        return PTR_ERR(tsk);
    }

    brook_tsk = tsk;
    wake_up_process(brook_tsk);
    return 0;
}

/* Ask the thread to stop; its return value is not used. */
static void __exit exit_modules(void)
{
    (void)kthread_stop(brook_tsk);
}

module_init(init_modules);
module_exit(exit_modules);

linux/kthread.h

/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)                       \
({                                                                      \
    struct task_struct *__k =                                           \
        kthread_create(threadfn, data, namefmt, ## __VA_ARGS__);        \
    if (!IS_ERR(__k))                                                   \
        wake_up_process(__k);                                           \
    __k;                                                                \
})

2010年1月3日星期日

Linux Kernel（8）- Notification

Kernel提供一個notifiers/notifier chains的機制，這是publish-and-subscribe的機制，也就是需要的人自己去訂閱（join到某個notifier chain中），當這個chain的provider有事件要發布，就publish出來（發布給join這個chain的所有人）。
kernel中也提供了一些notifier，如reboot，可以透過register_reboot_notifier()訂閱，用unregister_reboot_notifier()取消訂閱。我們也可以自訂自己的notifier，以下例子就是自訂一個notifier。此一範例為透過寫入/proc/brook_notifier將資料publish給訂閱brook_notifier_list的subscriber。

notifier publisher

#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>

#include "notifier.h"

MODULE_LICENSE("GPL");
// Declare a new notifier list: brook_notifier_list
BLOCKING_NOTIFIER_HEAD(brook_notifier_list);

// Wrapper for subscribing to brook_notifier_list events.
// Returns whatever blocking_notifier_chain_register() returns.
int register_brook_notifier(struct notifier_block *nb)
{
    int rc = blocking_notifier_chain_register(&brook_notifier_list, nb);

    return rc;
}
EXPORT_SYMBOL(register_brook_notifier);

// Wrapper for cancelling a subscription to brook_notifier_list events.
// Returns whatever blocking_notifier_chain_unregister() returns.
int unregister_brook_notifier(struct notifier_block *nb)
{
    int rc = blocking_notifier_chain_unregister(&brook_notifier_list, nb);

    return rc;
}
EXPORT_SYMBOL(unregister_brook_notifier);

// procfs write handler: copy the text written by user space and publish it
// to every subscriber of brook_notifier_list as event brook_num1.
//
// Returns count on success, -ENOMEM when the buffer cannot be allocated,
// or -EFAULT when the user data cannot be copied.
static int write_proc(struct file *filp, const char __user *buf,
                               unsigned long count, void *data)
{
    int ret = count;
    // One extra zeroed byte keeps the copy NUL-terminated, because it is
    // printed with "%s" here and by the subscribers.
    char *p = kzalloc(count + 1, GFP_KERNEL);

    if (!p) {
        printk("no mem\n");
        return -ENOMEM;
    }
    if (copy_from_user(p, buf, count)) {
        printk("fault\n");
        ret = -EFAULT;
        goto out;   // the buffer must be freed on this path too
    }
    printk("%s(): msg=\"%s\"\n", __FUNCTION__, p);

    // Publish the event to the subscribers of brook_notifier_list
    blocking_notifier_call_chain(&brook_notifier_list, brook_num1, (void*)p);
out:
    kfree(p);
    return ret;
}

/*
 * Create the write-only /proc/brook_notifier entry and attach write_proc()
 * to it. Returns 0 on success, or -ENOMEM when the entry cannot be
 * created, so the module does not load without its only interface.
 */
static int __init init_modules(void)
{
    struct proc_dir_entry *ent;

    ent = create_proc_entry("brook_notifier", S_IFREG | S_IWUSR, NULL);
    if (!ent) {
        printk("create proc child failed\n");
        return -ENOMEM;
    }

    ent->write_proc = write_proc;
    return 0;
}

/* Remove the proc entry created by init_modules(). */
static void __exit exit_modules(void)
{
    const char *entry = "brook_notifier";

    remove_proc_entry(entry, NULL);
}

module_init(init_modules);
module_exit(exit_modules);

notifier subscriber

#include <linux/init.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/notifier.h>

#include "notifier.h"

MODULE_LICENSE("GPL");

// callback function, 當brook_notifier_list有事件發生時, 會呼叫該function
static int brook_notify_sys(struct notifier_block *this,
                            unsigned long code, void *data)
{
    printk("%s(): code=%ld, msg=\"%s\"\n", __FUNCTION__, code, (char*)data);
    return 0;
}

// The notifier block that gets registered on brook_notifier_list;
// only the callback is set, all other fields stay zero-initialised.
static struct notifier_block brook_notifier = {
    .notifier_call = brook_notify_sys,
};

/*
 * Register brook_notifier on brook_notifier_list.
 * The result of the registration is returned instead of being discarded,
 * so a failure makes the module load fail.
 */
static int __init init_modules(void)
{
    return register_brook_notifier(&brook_notifier);
}

/* Remove brook_notifier from brook_notifier_list; the result is not used. */
static void __exit exit_modules(void)
{
    (void)unregister_brook_notifier(&brook_notifier);
}

module_init(init_modules);
module_exit(exit_modules);

header file

#ifndef BROOK_NOTIFIER_H
#define BROOK_NOTIFIER_H

#include <linux/notifier.h>

int register_brook_notifier(struct notifier_block *nb);
int unregister_brook_notifier(struct notifier_block *nb);

// Event codes passed to the subscribers of brook_notifier_list.
enum brook_msg {
    brook_num1 = 0,
    brook_num2 = 1,
    brook_num3 = 2
};

#endif

基本上我們都透過notifier_chain_register()來訂閱某個notifier，透過notifier_chain_unregister()取消某個notifier的訂閱，用notifier_call_chain()來發布event，不過我們常常會替訂閱與取消訂閱各寫一層wrapper，如我們的register_brook_notifier()/unregister_brook_notifier()。

參考資料：

Understanding Linux Network Internals, Ch4 Notification Chains
Publish/subscribe

訂閱：文章 (Atom)

Nano雞排

2011年2月26日星期六

Linux Kernel（8.1）- Notifier機制剖析

2011年1月16日星期日

Linux softirq執行分析(轉)

2011年1月9日星期日

Linux Modules（1.1）module parameters

2010年12月25日星期六

Linux Modules（7.3）- work queue

2010年12月11日星期六

Linux Kernel（13）- syscall

新增system call “brook()”到kernel 2.6.32的步驟(x86)

config automatically switches from 32-bit to 64-bit for x86

2010年11月27日星期六

Linux Kernel（12.1）- netfilter機制之初探

2010年4月16日星期五

Linux Kernel（3.2）- procfs之symlink與mkdir

2010年3月22日星期一

Linux Kernel（12）- netfilter

2010年1月31日星期日

Linux Kernel（11.1）- sysfs and hotplug

Linux Kernel（11）- sysfs and device node

2010年1月27日星期三

Linux Kernel（10.3）- Command line partition table parsing

Linux Kernel（10.2）- mtd partitions

2010年1月18日星期一

Linux Kernel（10.1）- drivers/mtd/devices/mtdram.c

2010年1月16日星期六

Linux Kernel（10）- MTD - Memory Technology Devices

2010年1月5日星期二

Linux Kernel（9）- Kthread

2010年1月3日星期日

Linux Kernel（8）- Notification

熱門文章

關於我自己

網誌存檔

搜尋此網誌

標籤

2011年2月26日 星期六

2011年1月16日 星期日

2011年1月9日 星期日

2010年12月25日 星期六

2010年12月11日 星期六

新增system call “brook()”到kernel 2.6.32的步驟(x86)

2010年11月27日 星期六

2010年4月16日 星期五

2010年3月22日 星期一

2010年1月31日 星期日

2010年1月27日 星期三

2010年1月18日 星期一

2010年1月16日 星期六

2010年1月5日 星期二

2010年1月3日 星期日

熱門文章

關於我自己

網誌存檔

搜尋此網誌

標籤

2011年2月26日星期六

2011年1月16日星期日

2011年1月9日星期日

2010年12月25日星期六

2010年12月11日星期六

2010年11月27日星期六

2010年4月16日星期五

2010年3月22日星期一

2010年1月31日星期日

2010年1月27日星期三

2010年1月18日星期一

2010年1月16日星期六

2010年1月5日星期二

2010年1月3日星期日