顯示具有 Linux - kernel 標籤的文章。 顯示所有文章
顯示具有 Linux - kernel 標籤的文章。 顯示所有文章

2016年2月21日 星期日

Linux Kernel(16.1)- Network Device Driver, simple snull.


這一篇藉由LDD(Linux Device Drivers)中的SNULL來了解最基本的Network Device Driver的架構,本章的sample code比原本的SNULL更為簡化,但是Network Topology是相同的,讓interface sn0/sn1可以透過遠方虛擬的remote0/remote1彼此溝通。

最基本的Network Device Driver的寫法就是allocate network device, "struct net_device"並且賦予hook function, "struct net_device_ops",然後將該network device註冊到kernel中,Kernel就可以調用該Network device,最基本的net_device_ops包含
  • ndo_open() and ndo_validate_addr() are called, when the NIC is bring up.
  • ndo_stop() is called, when the NIC is shut down.
  • ndo_start_xmit() is called, when a packet is sent from the NIC.
  • ndo_change_mtu() is called, when the MTU of the NIC is changed.
  • ndo_set_mac_address() is called, when the MAC address of the NIC is changed.
以下是demo code的net_device_ops部分
static const struct net_device_ops nic_netdev_ops = {
    /* Kernel calls ndo_open() and ndo_validate_addr()
     * when you bring up the NIC
     */
    .ndo_open               = nic_open,
    .ndo_validate_addr      = nic_validate_addr,

    /* when you shut down the NIC, kernel call the .ndo_stop() */
    .ndo_stop               = nic_close,

    /* Kernel calls ndo_start_xmit() when it wants to 
     *   transmit a packet. 
     */
    .ndo_start_xmit         = nic_start_xmit,

    /* ndo_change_mtu() is called, when you change MTU */
    .ndo_change_mtu         = nic_change_mtu,

    /* ndo_set_mac_address() is called,
     *   when you change the MAC addr
     */
    .ndo_set_mac_address    = nic_set_mac_addr,
};


我們是模擬ethernet,而ethernet有一些hook function可以用,如下
  • ndo_validate_addr() -> eth_validate_addr().
  • ndo_change_mtu() -> eth_change_mtu().
  • ndo_set_mac_address() -> eth_mac_addr().
在demo code中,ndo_validate_addr()/ndo_change_mtu()/ndo_set_mac_address()我都是將其轉成ethernet的default hook function,我沒直接掛,是因為我想印出訊息來看
static int nic_validate_addr(struct net_device *netdev)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    return eth_validate_addr(netdev);
}

static int nic_change_mtu(struct net_device *netdev, int new_mtu)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    return eth_change_mtu(netdev, new_mtu);
}

static int nic_set_mac_addr(struct net_device *netdev, void *addr)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    return eth_mac_addr(netdev, addr);
}

另外幾個比較重要的function是netif_start_queue()/netif_stop_queue()
  • netif_start_queue()是通知上層,可以將資料送到該網卡,通常放在ndo_open()裡面
  • netif_stop_queue()是通知上層,停止將資料送到該網卡,通常放在ndo_stop()裡面


由於我們沒有真的remote0/remote1可以回應,所以必須設定flag/IFF_NOARP在sn0跟sn1,並且自己要處理L2的header,所以必須在額外掛上"struct header_ops"。

以下為完整的driver,主要code都是印訊息觀察driver的call flow
/* reference ldd3, snull.c */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

/* for in_device, in_ifaddr */
#include <linux/inetdevice.h>

MODULE_AUTHOR("Brook");
MODULE_DESCRIPTION("Kernel module for demo");
MODULE_LICENSE("GPL");

#define MAX_ETH_FRAME_SIZE   1792
struct nic_priv {
    /* you can use array to queue more packet */
    unsigned char *tx_buf;
    unsigned int  tx_len;
    u32           msg_enable;
};

static struct net_device *nic_dev[2];
/* netif msg type, defined in netdevice.h
    NETIF_MSG_DRV           = 0x0001,
    NETIF_MSG_PROBE         = 0x0002,
    NETIF_MSG_LINK          = 0x0004,
    NETIF_MSG_TIMER         = 0x0008,
    NETIF_MSG_IFDOWN        = 0x0010,
    NETIF_MSG_IFUP          = 0x0020,
    NETIF_MSG_RX_ERR        = 0x0040,
    NETIF_MSG_TX_ERR        = 0x0080,
    NETIF_MSG_TX_QUEUED     = 0x0100,
    NETIF_MSG_INTR          = 0x0200,
    NETIF_MSG_TX_DONE       = 0x0400,
    NETIF_MSG_RX_STATUS     = 0x0800,
    NETIF_MSG_PKTDATA       = 0x1000,
    NETIF_MSG_HW            = 0x2000,
    NETIF_MSG_WOL           = 0x4000,
*/
#define DEF_MSG_ENABLE 0xffff

static void dump(unsigned char *buf)
{
    unsigned char *p, sbuf[2*(sizeof(struct ethhdr) + sizeof(struct iphdr))];
    int i;
    p = sbuf;

    for(i = 0; i < sizeof(struct ethhdr); i++) {
        p += sprintf(p, "%02X ", buf[i]);
    }
    printk("eth %s\n", sbuf);

    p = sbuf;
    for(i = 0; i < sizeof(struct iphdr); i++) {
        p += sprintf(p, "%02X ", buf[sizeof(struct ethhdr) + i]);
    }
    printk("iph %s\n", sbuf);

    p = sbuf;
    for(i = 0; i < 4; i++) {
        p += sprintf(p, "%02X ", buf[sizeof(struct ethhdr) + sizeof(struct iphdr) + i]);
    }
    printk("payload %s\n", sbuf);
}


static void
nic_rx(struct net_device *netdev, int len, unsigned char *buf)
{
    struct sk_buff *skb;
    struct nic_priv *priv = netdev_priv(netdev);

    netif_info(priv, hw, netdev, "%s(#%d), rx:%d\n",
                __func__, __LINE__, len);
    /*
     * The packet has been retrieved from the transmission
     * medium. Build an skb around it, so upper layers can handle it
     */
    skb = dev_alloc_skb(len + 2);
    if (!skb) {
        netif_err(priv, rx_err, netdev,
                  "%s(#%d), rx: low on mem - packet dropped\n",
                  __func__, __LINE__);
        netdev->stats.rx_dropped++;
        return;
    }
    skb_reserve(skb, 2); /* align IP on 16B boundary */
    memcpy(skb_put(skb, len), buf, len);

    /* Write metadata, and then pass to the receive level */
    skb->dev = netdev;
    skb->protocol = eth_type_trans(skb, netdev);
    skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */
    netdev->stats.rx_packets++;
    netdev->stats.rx_bytes += len;
    netif_rx(skb);
}

static int nic_open(struct net_device *netdev)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, ifup, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    /* may be using DMA */
    priv->tx_buf = kmalloc(MAX_ETH_FRAME_SIZE, GFP_KERNEL);
    if (priv->tx_buf == NULL) {
        netif_info(priv, ifup, netdev, "%s(#%d), cannot alloc tx buf\n",
                    __func__, __LINE__);
        return -ENOMEM;
    }
    netif_start_queue(netdev);
    return 0;
}

static int nic_close(struct net_device *netdev)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, ifdown, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    netif_stop_queue(netdev);
    return 0;
}

static void nic_hw_xmit(struct net_device *netdev)
{
    struct nic_priv *priv = netdev_priv(netdev);
    struct iphdr *iph;
    u32 *saddr, *daddr;
    struct in_device* in_dev;
    struct in_ifaddr* if_info;

    if (priv->tx_len < sizeof(struct ethhdr) + sizeof(struct iphdr)) {
        netif_info(priv, hw, netdev, "%s(#%d), too short\n",
                   __func__, __LINE__);
        return;
    }
    dump(priv->tx_buf);
    iph = (struct iphdr *)(priv->tx_buf + sizeof(struct ethhdr));
    saddr = &iph->saddr;
    daddr = &iph->daddr;

    netif_info(priv, hw, netdev, "%s(#%d), orig, src:%pI4, dst:%pI4, len:%d\n",
                __func__, __LINE__, saddr, daddr, priv->tx_len);

    in_dev = nic_dev[(netdev == nic_dev[0] ? 1 : 0)]->ip_ptr;
    if (in_dev) {
        if_info = in_dev->ifa_list;
        for (if_info = in_dev->ifa_list; if_info; if_info=if_info->ifa_next) {
#if 0
            printk("label:%s, address=%pI4\n",
               if_info->ifa_label, &if_info->ifa_address);
#endif
            *saddr = *daddr = if_info->ifa_address;
            ((u8 *)saddr)[3]++;
            netif_info(priv, hw, netdev, "%s(#%d), new, src:%pI4, dst:%pI4\n",
                        __func__, __LINE__, saddr, daddr);
            break;
        }
        if (!if_info) {
            /* drop packet */
            netdev->stats.tx_dropped++;
            netif_info(priv, hw, netdev, "%s(#%d), drop packet\n",
                        __func__, __LINE__);
            return;
        }
    }

    iph->check = 0;         /* and rebuild the checksum (ip needs it) */
    iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);

    netdev->stats.tx_packets++;
    netdev->stats.tx_bytes += priv->tx_len;

    nic_rx(nic_dev[(netdev == nic_dev[0] ? 1 : 0)], priv->tx_len, priv->tx_buf);
}

static netdev_tx_t nic_start_xmit(struct sk_buff *skb,
                                  struct net_device *netdev)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), orig, src:%pI4, dst:%pI4\n",
                __func__, __LINE__, &(ip_hdr(skb)->saddr), &(ip_hdr(skb)->daddr));
    priv->tx_len = skb->len;
    if (likely(priv->tx_len < MAX_ETH_FRAME_SIZE)) {
        if (priv->tx_len < ETH_ZLEN) {
            memset(priv->tx_buf, 0, ETH_ZLEN);
            priv->tx_len = ETH_ZLEN;
        }
        skb_copy_and_csum_dev(skb, priv->tx_buf);
        dev_kfree_skb_any(skb);
    } else {
        dev_kfree_skb_any(skb);
        netdev->stats.tx_dropped++;
        return NETDEV_TX_OK;
    }

    nic_hw_xmit(netdev);
    return NETDEV_TX_OK;
}

static int nic_validate_addr(struct net_device *netdev)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    return eth_validate_addr(netdev);
}

static int nic_change_mtu(struct net_device *netdev, int new_mtu)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    return eth_change_mtu(netdev, new_mtu);
}

static int nic_set_mac_addr(struct net_device *netdev, void *addr)
{
    struct nic_priv *priv = netdev_priv(netdev);
    netif_info(priv, drv, netdev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
    return eth_mac_addr(netdev, addr);
}

/*
 * This function is called to fill up an eth header, since arp is not
 * available on the interface
 */
int snull_header(struct sk_buff *skb, struct net_device *netdev,
                unsigned short type, const void *daddr, const void *saddr,
                unsigned len)
{
    struct nic_priv *priv = netdev_priv(netdev);
    struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
    struct net_device *dst_netdev;

    netif_info(priv, drv, netdev, "%s(#%d)\n",
                __func__, __LINE__);
    dst_netdev = nic_dev[(netdev == nic_dev[0] ? 1 : 0)];
    eth->h_proto = htons(type);
    memcpy(eth->h_source, saddr ? saddr : netdev->dev_addr, netdev->addr_len);
    memcpy(eth->h_dest, dst_netdev->dev_addr, dst_netdev->addr_len);
    return (netdev->hard_header_len);
}

static const struct header_ops snull_header_ops = {
        .create  = snull_header,
};

static const struct net_device_ops nic_netdev_ops = {
    /* Kernel calls ndo_open() and ndo_validate_addr()
     * when you bring up the NIC
     */
    .ndo_open               = nic_open,
    .ndo_validate_addr      = nic_validate_addr,

    /* when you shut down the NIC, kernel call the .ndo_stop() */
    .ndo_stop               = nic_close,

    /* Kernel calls ndo_start_xmit() when it wants to 
     *   transmit a packet. 
     */
    .ndo_start_xmit         = nic_start_xmit,

    /* ndo_change_mtu() is called, when you change MTU */
    .ndo_change_mtu         = nic_change_mtu,

    /* ndo_set_mac_address() is called,
     *   when you change the MAC addr
     */
    .ndo_set_mac_address    = nic_set_mac_addr,
};

static struct net_device* nic_alloc_netdev(void)
{
    struct net_device *netdev;

    netdev = alloc_etherdev(sizeof(struct nic_priv));
    if (!netdev) {
        pr_err("%s(#%d): alloc dev failed",
               __func__, __LINE__);
        return NULL;
    }
    eth_hw_addr_random(netdev);
    netdev->netdev_ops = &nic_netdev_ops;

    /* keep the default flags, just add NOARP */
    netdev->flags |= IFF_NOARP;

    /* There are no explicit users, so this is 
     *     now equivalent to NETIF_F_HW_CSUM. */
    netdev->features |= NETIF_F_HW_CSUM;

    netdev->header_ops = &snull_header_ops;

    return netdev;
}

static int __init brook_init(void)
{
    int ret;
    struct nic_priv *priv;

    nic_dev[0] = nic_alloc_netdev();
    if (!nic_dev[0]) {
        pr_err("%s(#%d): alloc netdev[0] failed", __func__, __LINE__);
        return -ENOMEM;
    }

    nic_dev[1] = nic_alloc_netdev();
    if (!nic_dev[1]) {
        pr_err("%s(#%d): alloc netdev[1] failed", __func__, __LINE__);
        ret = -ENOMEM;
        goto alloc_2nd_failed;
    }

    ret = register_netdev(nic_dev[0]);
    if (ret) {
        pr_err("%s(#%d): reg net driver failed. ret:%d",
               __func__, __LINE__, ret);
        goto reg1_failed;
    }

    ret = register_netdev(nic_dev[1]);
    if (ret) {
        pr_err("%s(#%d): reg net driver failed. ret:%d",
               __func__, __LINE__, ret);
        goto reg2_failed;
    }

    priv = netdev_priv(nic_dev[0]);
    priv->msg_enable = DEF_MSG_ENABLE;
    priv = netdev_priv(nic_dev[1]);
    priv->msg_enable = DEF_MSG_ENABLE;
    return 0;

reg2_failed:
    unregister_netdev(nic_dev[0]);
reg1_failed:
    free_netdev(nic_dev[1]);
alloc_2nd_failed:
    free_netdev(nic_dev[0]);
    return ret;
}
module_init(brook_init);

static void __exit brook_exit(void)
{
    int i;
    pr_info("%s(#%d): remove module", __func__, __LINE__);
    for (i = 0; i < ARRAY_SIZE(nic_dev); i++) {
        unregister_netdev(nic_dev[i]);
        free_netdev(nic_dev[i]);
    }
}
module_exit(brook_exit);





參考資料:
  1. Linux Device Drivers, Third Edition, Chapter 17: Network Drivers, https://lwn.net/Kernel/LDD3/,
  2. Linux Networking and Network Devices APIs, https://www.kernel.org/doc/htmldocs/networking/index.html




2016年1月3日 星期日

Linux Kernel(15.2)- platform_device_register()之如何调用driver.probe()


Linux Kernel(16.1)- platform_driver_register()之如何调用driver.probe()之後,我們來看如何由platform_device_register调用driver.probe()

int platform_device_register(struct platform_device *pdev)
{
    device_initialize(&pdev->dev);
    arch_setup_pdev_archdata(pdev);
    return platform_device_add(pdev);
}
EXPORT_SYMBOL_GPL(platform_device_register);


int platform_device_add(struct platform_device *pdev)
{
    int i, ret;

    if (!pdev)
        return -EINVAL;

    if (!pdev->dev.parent)
        pdev->dev.parent = &platform_bus;

    pdev->dev.bus = &platform_bus_type;

    switch (pdev->id) {
    default:
        dev_set_name(&pdev->dev, "%s.%d", pdev->name,  pdev->id);
        break;
    case PLATFORM_DEVID_NONE:
        dev_set_name(&pdev->dev, "%s", pdev->name);
        break;
    case PLATFORM_DEVID_AUTO:
        /*
         * Automatically allocated device ID. We mark it as such so
         * that we remember it must be freed, and we append a suffix
         * to avoid namespace collision with explicit IDs.
         */
        ret = ida_simple_get(&platform_devid_ida, 0, 0, GFP_KERNEL);
        if (ret < 0)
            goto err_out;
        pdev->id = ret;
        pdev->id_auto = true;
        dev_set_name(&pdev->dev, "%s.%d.auto", pdev->name, pdev->id);
        break;
    }

    for (i = 0; i < pdev->num_resources; i++) {
        struct resource *p, *r = &pdev->resource[i];

        if (r->name == NULL)
            r->name = dev_name(&pdev->dev);

        p = r->parent;
        if (!p) {
            if (resource_type(r) == IORESOURCE_MEM)
                p = &iomem_resource;
            else if (resource_type(r) == IORESOURCE_IO)
                p = &ioport_resource;
        }

        if (p && insert_resource(p, r)) {
            dev_err(&pdev->dev, "failed to claim resource %d\n", i);
            ret = -EBUSY;
            goto failed;
        }
    }

    pr_debug("Registering platform device '%s'. Parent at %s\n",
         dev_name(&pdev->dev), dev_name(pdev->dev.parent));

    ret = device_add(&pdev->dev);
    if (ret == 0)
        return ret;

 failed:
    if (pdev->id_auto) {
        ida_simple_remove(&platform_devid_ida, pdev->id);
        pdev->id = PLATFORM_DEVID_AUTO;
    }

    while (--i >= 0) {
        struct resource *r = &pdev->resource[i];
        if (r->parent)
            release_resource(r);
    }

 err_out:
    return ret;
}


int device_add(struct device *dev) 
{
    struct device *parent = NULL;
    struct kobject *kobj;
    struct class_interface *class_intf;
    int error = -EINVAL;

    dev = get_device(dev);
    if (!dev)
        goto done;

    if (!dev->p) {
        error = device_private_init(dev);
        if (error)
            goto done;
    }

    /*
     * for statically allocated devices, which should all be converted
     * some day, we need to initialize the name. We prevent reading back
     * the name, and force the use of dev_name()
     */
    if (dev->init_name) {
        dev_set_name(dev, "%s", dev->init_name);
        dev->init_name = NULL;
    }

    /* subsystems can specify simple device enumeration */
    if (!dev_name(dev) && dev->bus && dev->bus->dev_name)
        dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);

    if (!dev_name(dev)) {
        error = -EINVAL;
        goto name_error;
    }

    pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

    parent = get_device(dev->parent);
    kobj = get_device_parent(dev, parent);
    if (kobj)
        dev->kobj.parent = kobj;

    /* use parent numa_node */
    if (parent)
        set_dev_node(dev, dev_to_node(parent));

    /* first, register with generic layer. */
    /* we require the name to be set before, and pass NULL */
    error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
    if (error)
        goto Error;

    /* notify platform of device entry */
    if (platform_notify)
        platform_notify(dev);

    error = device_create_file(dev, &dev_attr_uevent);
    if (error)
        goto attrError;

    error = device_add_class_symlinks(dev);
    if (error)
        goto SymlinkError;
    error = device_add_attrs(dev);
    if (error)
        goto AttrsError;
    error = bus_add_device(dev);
    if (error)
        goto BusError;
    error = dpm_sysfs_add(dev);
    if (error)
        goto DPMError;
    device_pm_add(dev);

    if (MAJOR(dev->devt)) {
        error = device_create_file(dev, &dev_attr_dev);
        if (error)
            goto DevAttrError;

        error = device_create_sys_dev_entry(dev);
        if (error)
            goto SysEntryError;

        devtmpfs_create_node(dev);
    }

    /* Notify clients of device addition.  This call must come
     * after dpm_sysfs_add() and before kobject_uevent().
     */
    if (dev->bus)
        blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                         BUS_NOTIFY_ADD_DEVICE, dev);

    kobject_uevent(&dev->kobj, KOBJ_ADD);
    bus_probe_device(dev);
    if (parent)
        klist_add_tail(&dev->p->knode_parent,
                   &parent->p->klist_children);

    if (dev->class) {
        mutex_lock(&dev->class->p->mutex);
        /* tie the class to the device */
        klist_add_tail(&dev->knode_class,
                   &dev->class->p->klist_devices);

        /* notify any interfaces that the device is here */
        list_for_each_entry(class_intf,
                    &dev->class->p->interfaces, node)
            if (class_intf->add_dev)
                class_intf->add_dev(dev, class_intf);
        mutex_unlock(&dev->class->p->mutex);
    }
done:
    put_device(dev);
    return error;
 SysEntryError:
    if (MAJOR(dev->devt))
        device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
    device_pm_remove(dev);
    dpm_sysfs_remove(dev);
 DPMError:
    bus_remove_device(dev);
 BusError:
    device_remove_attrs(dev);
 AttrsError:
    device_remove_class_symlinks(dev);
 SymlinkError:
    device_remove_file(dev, &dev_attr_uevent);
 attrError:
    kobject_uevent(&dev->kobj, KOBJ_REMOVE);
    kobject_del(&dev->kobj);
 Error:
    cleanup_device_parent(dev);
    put_device(parent);
name_error:
    kfree(dev->p);
    dev->p = NULL;
    goto done;
}


void bus_probe_device(struct device *dev)
{
    struct bus_type *bus = dev->bus;
    struct subsys_interface *sif;

    if (!bus)
        return;

    if (bus->p->drivers_autoprobe)
        device_initial_probe(dev);

    mutex_lock(&bus->p->mutex);
    list_for_each_entry(sif, &bus->p->interfaces, node)
        if (sif->add_dev)
            sif->add_dev(dev, sif);
    mutex_unlock(&bus->p->mutex);
}


void device_initial_probe(struct device *dev)
{
    __device_attach(dev, true);
}


static int __device_attach(struct device *dev, bool allow_async)
{
    int ret = 0;

    device_lock(dev);
    if (dev->driver) {
        if (klist_node_attached(&dev->p->knode_driver)) {
            ret = 1;
            goto out_unlock;
        }
        ret = device_bind_driver(dev);
        if (ret == 0)
            ret = 1;
        else {
            dev->driver = NULL;
            ret = 0;
        }
    } else {
        struct device_attach_data data = {
            .dev = dev,
            .check_async = allow_async,
            .want_async = false,
        };

        if (dev->parent)
            pm_runtime_get_sync(dev->parent);

        ret = bus_for_each_drv(dev->bus, NULL, &data,
                    __device_attach_driver);
        if (!ret && allow_async && data.have_async) {
            /*
             * If we could not find appropriate driver
             * synchronously and we are allowed to do
             * async probes and there are drivers that
             * want to probe asynchronously, we'll
             * try them.
             */
            dev_dbg(dev, "scheduling asynchronous probe\n");
            get_device(dev);
            async_schedule(__device_attach_async_helper, dev);
        } else {
            pm_request_idle(dev);
        }

        if (dev->parent)
            pm_runtime_put(dev->parent);
    }
out_unlock:
    device_unlock(dev);
    return ret;
}


static int __device_attach_driver(struct device_driver *drv, void *_data)
{
    struct device_attach_data *data = _data;
    struct device *dev = data->dev;
    bool async_allowed;

    /*
     * Check if device has already been claimed. This may
     * happen with driver loading, device discovery/registration,
     * and deferred probe processing happens all at once with
     * multiple threads.
     */
    if (dev->driver)
        return -EBUSY;

    if (!driver_match_device(drv, dev))
        return 0;

    async_allowed = driver_allows_async_probing(drv);

    if (async_allowed)
        data->have_async = true;

    if (data->check_async && async_allowed != data->want_async)
        return 0;

    return driver_probe_device(drv, dev);
}


static inline int driver_match_device(struct device_driver *drv,
                      struct device *dev)
{
    return drv->bus->match ? drv->bus->match(dev, drv) : 1;
}





Linux Kernel(15.1)- platform_driver_register()之如何调用driver.probe()



轉自platform_driver_register()--如何match之后调用probe

int platform_driver_register(struct platform_driver *drv)
{
    drv->driver.bus = &platform_bus_type;/*关联总线*/
    /*关联driver的设备方法*/
    if (drv->probe)
        drv->driver.probe = platform_drv_probe;
    if (drv->remove)
        drv->driver.remove = platform_drv_remove;
    if (drv->shutdown)
        drv->driver.shutdown = platform_drv_shutdown;

    return driver_register(&drv->driver);/*注册驱动*/
}

/******************************************************************************/
struct platform_driver {
    int (*probe)(struct platform_device *);/*匹配到设备后调用,下面分析内核代码怎么调用的*/
    int (*remove)(struct platform_device *);
    void (*shutdown)(struct platform_device *);
    int (*suspend)(struct platform_device *, pm_message_t state);
    int (*resume)(struct platform_device *);
    struct device_driver driver;
    const struct platform_device_id *id_table;
};

struct bus_type platform_bus_type = {
    .name        = "platform",
    .dev_attrs    = platform_dev_attrs,
    .match        = platform_match,
    .uevent        = platform_uevent,
    .pm        = &platform_dev_pm_ops,
};
/********************************************************************************/

int driver_register(struct device_driver *drv)
{
    int ret;
    struct device_driver *other;

    BUG_ON(!drv->bus->p);

    if ((drv->bus->probe && drv->probe) ||
        (drv->bus->remove && drv->remove) ||
        (drv->bus->shutdown && drv->shutdown))
        printk(KERN_WARNING "Driver '%s' needs updating - please use "
            "bus_type methods\n", drv->name);

    other = driver_find(drv->name, drv->bus);
    if (other) {
        put_driver(other);
        printk(KERN_ERR "Error: Driver '%s' is already registered, "
            "aborting...\n", drv->name);
        return -EBUSY;
    }

    ret = bus_add_driver(drv);
    if (ret)
        return ret;
    ret = driver_add_groups(drv, drv->groups);
    if (ret)
        bus_remove_driver(drv);
    return ret;
}

int bus_add_driver(struct device_driver *drv)
{
    struct bus_type *bus;
    struct driver_private *priv;
    int error = 0;

    bus = bus_get(drv->bus);
    if (!bus)
        return -EINVAL;

    pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);
    if (!priv) {
        error = -ENOMEM;
        goto out_put_bus;
    }
    klist_init(&priv->klist_devices, NULL, NULL);
    priv->driver = drv;
    drv->p = priv;
    priv->kobj.kset = bus->p->drivers_kset;
    error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,
                     "%s", drv->name);
    if (error)
        goto out_unregister;

    if (drv->bus->p->drivers_autoprobe) {
        error = driver_attach(drv);
        if (error)
            goto out_unregister;
    }
    klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);
    module_add_driver(drv->owner, drv);

    error = driver_create_file(drv, &driver_attr_uevent);
    if (error) {
        printk(KERN_ERR "%s: uevent attr (%s) failed\n",
            __func__, drv->name);
    }
    error = driver_add_attrs(bus, drv);
    if (error) {
        /* How the hell do we get out of this pickle? Give up */
        printk(KERN_ERR "%s: driver_add_attrs(%s) failed\n",
            __func__, drv->name);
    }

    if (!drv->suppress_bind_attrs) {
        error = add_bind_files(drv);
        if (error) {
            /* Ditto */
            printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
                __func__, drv->name);
        }
    }

    kobject_uevent(&priv->kobj, KOBJ_ADD);
    return 0;

out_unregister:
    kobject_put(&priv->kobj);
    kfree(drv->p);
    drv->p = NULL;
out_put_bus:
    bus_put(bus);
    return error;
}

int driver_attach(struct device_driver *drv)
{
    /*对总线上的每一个设备都调用__driver_attach*/
    return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
}

static int __driver_attach(struct device *dev, void *data)
{
    struct device_driver *drv = data;

    /*
     * Lock device and try to bind to it. We drop the error
     * here and always return 0, because we need to keep trying
     * to bind to devices and some drivers will return an error
     * simply if it didn't support the device.
     *
     * driver_probe_device() will spit a warning if there
     * is an error.
     */

    if (!driver_match_device(drv, dev))
        return 0;

    if (dev->parent)    /* Needed for USB */
        device_lock(dev->parent);
    device_lock(dev);
    if (!dev->driver)
        driver_probe_device(drv, dev);
    device_unlock(dev);
    if (dev->parent)
        device_unlock(dev->parent);

    return 0;
}

static inline int driver_match_device(struct device_driver *drv,
                      struct device *dev)
{
    /*调用总线的match去匹配设备和驱动*/
    return drv->bus->match ? drv->bus->match(dev, drv) : 1;
}

int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int ret = 0;

    if (!device_is_registered(dev))
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    pm_runtime_get_noresume(dev);
    pm_runtime_barrier(dev);
    ret = really_probe(dev, drv);
    pm_runtime_put_sync(dev);

    return ret;
}
static int really_probe(struct device *dev, struct device_driver *drv)
{
    int ret = 0;

    atomic_inc(&probe_count);
    pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
         drv->bus->name, __func__, drv->name, dev_name(dev));
    WARN_ON(!list_empty(&dev->devres_head));

    dev->driver = drv;
    if (driver_sysfs_add(dev)) {
        printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
            __func__, dev_name(dev));
        goto probe_failed;
    }
/**********************************************************************************/
    if (dev->bus->probe) {/*首先看总线有没有probe函数,若有则调用,而平台总线没有probe*/
        ret = dev->bus->probe(dev);
        if (ret)
            goto probe_failed;
    } else if (drv->probe) {/*然后看驱动有没有probe函数,若有则调用,*/
        ret = drv->probe(dev);
        if (ret)
            goto probe_failed;
    }
/************************************************************************************/

    driver_bound(dev);
    ret = 1;
    pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);
    goto done;

probe_failed:
    devres_release_all(dev);
    driver_sysfs_remove(dev);
    dev->driver = NULL;

    if (ret != -ENODEV && ret != -ENXIO) {
        /* driver matched but the probe failed */
        printk(KERN_WARNING
               "%s: probe of %s failed with error %d\n",
               drv->name, dev_name(dev), ret);
    }
    /*
     * Ignore errors returned by ->probe so that the next driver can try
     * its luck.
     */
    ret = 0;
done:
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);
    return ret;
}


/*平台总线的match逻辑*/
static int platform_match(struct device *dev, struct device_driver *drv)
{
    struct platform_device *pdev = to_platform_device(dev);
    struct platform_driver *pdrv = to_platform_driver(drv);

    /* match against the id table first */
    if (pdrv->id_table)
        return platform_match_id(pdrv->id_table, pdev) != NULL;

    /* fall-back to driver name match */
    return (strcmp(pdev->name, drv->name) == 0);/*驱动名字与设备名字要匹配*/
}




2016年1月2日 星期六

Linux Kernel(15)- Platform Devices


很多人心中都有過一個問題What is the difference between Platform driver and normal device driver?,簡單的來說Platform devices就non-discoverable,也就是device本身沒辦法跟系統說"我在這裡",典型的就是I2C device,它不會通知kernel"我在這裡",通常是預先知道有個I2C device在那裡,再由software設定好,這類non-discoverable device就適用Platform devices架構來寫。
platform device會被connect在platform bus上,而platform bus是一個虛擬的bus(pseudo-bus),這樣可以讓整個架構platform driver符合Linux的標準driver model。

這篇會根據The platform device API教導如何寫一個簡單的Platform devices,基本上最基本的platform device只需要name,因為platform bus會根據platform device與platform driver的name是否match執行driver的probe(),而最簡單的platform driver只需要name,跟probe()與remove()即可。
#include <linux/module.h>
#include <linux/platform_device.h>

MODULE_AUTHOR("Brook");
MODULE_DESCRIPTION("Kernel module for demo");
MODULE_LICENSE("GPL");

#define DEVNAME "brook"

#define DYN_ALLOC 1

static struct platform_device brook_device = {
    .name = DEVNAME,
};

static int brook_probe(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

static int brook_remove(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

static struct platform_driver brook_driver = {
    .driver = {
        .name  = DEVNAME,
        .owner = THIS_MODULE,
    },
    .probe  = brook_probe,
    .remove = brook_remove,
};

static int __init brook_init(void)
{
    int err;
    pr_info("%s(#%d)\n", __func__, __LINE__);

    err = platform_device_register(&brook_device);
    if (err) {
        pr_err("%s(#%d): platform_device_register failed(%d)\n",
                __func__, __LINE__, err);
        return err;
    }

    err = platform_driver_register(&brook_driver);
    if (err) {
        dev_err(&(brook_device.dev), "%s(#%d): platform_driver_register fail(%d)\n",
                __func__, __LINE__, err);
        goto dev_reg_failed;
    }
    return err;

dev_reg_failed:
    platform_device_unregister(&brook_device);

    return err;
}
module_init(brook_init);

static void __exit brook_exit(void)
{
    dev_info(&(brook_device.dev), "%s(#%d)\n", __func__, __LINE__);
    platform_device_unregister(&brook_device);
    platform_driver_unregister(&brook_driver);
}
module_exit(brook_exit);


使用platform_device_register()會導致"brook.0" does not have a release() function, it is broken and must be fixed.的OOPS,可以改用platform_device_alloc() + platform_device_add(),platform_device_alloc()裡面就會做pa->pdev.dev.release = platform_device_release。
#include <linux/module.h>
#include <linux/platform_device.h>

MODULE_AUTHOR("Brook");
MODULE_DESCRIPTION("Kernel module for demo");
MODULE_LICENSE("GPL");

#define DEVNAME "brook"

#define DYN_ALLOC 1

static struct platform_device *brook_device;

static int brook_probe(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

static int brook_remove(struct platform_device *pdev)
{
    pr_info("%s(#%d)\n", __func__, __LINE__);
    return 0;
}

static struct platform_driver brook_driver = {
    .driver = {
        .name  = DEVNAME,
        .owner = THIS_MODULE,
    },
    .probe  = brook_probe,
    .remove = brook_remove,
};

static int __init brook_init(void)
{
    int err;
    pr_info("%s(#%d)\n", __func__, __LINE__);

    /* using platform_device_alloc() + platform_device_add() 
     * instead of platform_device_register() to avoid the OOPS, 
     *     "Device 'brook.0' does not have a release() function,
     *      it is broken and must be fixed."
     */
    brook_device = platform_device_alloc(DEVNAME, 0);
    if (!brook_device) {
        pr_err("%s(#%d): platform_device_alloc fail\n",
               __func__, __LINE__);
        return -ENOMEM;
    }

    err = platform_device_add(brook_device);
    if (err) {
        pr_err("%s(#%d): platform_device_add failed\n",
               __func__, __LINE__);
        goto dev_add_failed;
    }

    err = platform_driver_register(&brook_driver);
    if (err) {
        dev_err(&(brook_device->dev), "%s(#%d): platform_driver_register fail(%d)\n",
                __func__, __LINE__, err);
        goto dev_reg_failed;
    }
    return err;

dev_add_failed:
    platform_device_put(brook_device);
dev_reg_failed:
    platform_device_unregister(brook_device);

    return err;
}
module_init(brook_init);

static void __exit brook_exit(void)
{
    dev_info(&(brook_device->dev), "%s(#%d)\n", __func__, __LINE__);
    platform_device_unregister(brook_device);
    platform_driver_unregister(&brook_driver);
}
module_exit(brook_exit);



    參考資料:
  1. Documentation/driver-model
  2. What is the difference between Platform driver and normal device driver?
  3. The platform device API
  4. Linux Kernel architecture for device drivers
  5. platform_driver_register()--如何match之后调用probe
  6. Improved dynamically allocated platform_device interface




2014年6月14日 星期六

Table Of Content for tag "Linux - kernel"


Linux Kernel(1)- Linux Module簡介
Linux Modules(1.1)module parameters

Linux Kernel(2)- register char device

Linux Kernel(3)- procfs
Linux Kernel(3.1)- procfs之vector方式寫入
Linux Kernel(3.2)- procfs之symlink與mkdir

Linux Kernel(4)- seq_file
Linux Kernel(4.1)- seq_file之範例(fp/proc/devices.c)
Linux Kernel(4.2)- seq_file之single page

Linux Kernel(5)- ioctl

Linux Kernel(6)- miscdev

Linux Kernel(7)- timing
Linux Kernel(7.1)- timer
Linux Modules(7.2)- tasklet
Linux Modules(7.3)- work queue

Linux Kernel(8)- Notification
Linux Kernel(8.1)- Notifier機制剖析

Linux Kernel(9)- Kthread

Linux Kernel(10.1)- drivers/mtd/devices/mtdram.c
Linux Kernel(10.2)- mtd partitions
Linux Kernel(10.3)- Command line partition table parsing

Linux Kernel(11)- sysfs and device node
Linux Kernel(11.1)- sysfs and hotplug

Linux Kernel(12)- netfilter
Linux Kernel(12.1)- netfilter機制之初探

Linux Kernel(13)- syscall

Linux Kernel(14)- Kernel Synchronization
Linux Modules(14.1)- Read Copy Update

Linux Kernel(15)- Platform Devices
Linux Kernel(15.1)- platform_driver_register()之如何调用driver.probe()
Linux Kernel(15.2)- platform_device_register()之如何调用driver.probe()

Linux Kernel(16.1)- Network Device Driver, simple snull





2011年12月10日 星期六

send signal to user-space


某天有個需求是希望當kernel發生某事件時通知user-space的process,心裡想最快就是送signal,於是google一下,果然有人有類似的需求,signals handling in the kernel,於是改了一下把他放上來,值得一提的是,其實這樣並不被鼓勵的,而且原本的kill_proc_info並沒有被export出來,所以如果是module要使用的話,就必須把他export出來,EXPORT_SYMBOL(kill_proc_info)

#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>

#include <linux/sched.h>
#include <linux/kernel.h> /* printk() */
// #include <linux/slab.h> /* kmalloc() */
#include <linux/errno.h>  /* error codes */
#include <linux/types.h>  /* size_t */
#include <linux/signal.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>

#define PROC_NAME "sig2pid"

/**
 * 送signal 到pid去
 */
static int send_sig_to_pid(int sig, pid_t pid)
{
    struct siginfo info;

    info.si_signo = sig;
    info.si_errno = 0;
    info.si_code = SI_USER; // sent by kill, sigsend, raise
    info.si_pid = get_current()->pid; // sender's pid
    info.si_uid = current_uid(); // sender's uid

    return kill_proc_info(sig, &info, pid);
}

/**
 * /proc/sig2pid的write ops
 */
static int
sig2pid_proc_write(struct file *file, const char __user * buffer,
                     unsigned long count, void *data)
{
    int sig, pid, ret;
    char line[count];
    ret = copy_from_user(line, buffer, count);

    if (ret) {
        return -EFAULT;
    }
    sscanf(line, "%d %d", &pid, &sig);
    printk("%s(#%d): pid(%d), sig(%d)\n",
            __func__, __LINE__, pid, sig);
    send_sig_to_pid(sig, (pid_t) pid);
    return count;
}

/**
 * 建立/proc/sig2pid
 */
static int create_proc_file(void)
{
    struct proc_dir_entry *p;
    p = create_proc_entry(PROC_NAME, S_IFREG | S_IWUGO, NULL);
    if (!p) {
        printk("%s(#%d): create proc entry failed\n", __func__, __LINE__);
        return -EFAULT;
    }
    p->write_proc = sig2pid_proc_write;
    return 0;
}

int sig2pid_init_module(void)
{
    return create_proc_file();
}

void sig2pid_exit_module(void)
{
    remove_proc_entry(PROC_NAME, NULL);
}

module_init(sig2pid_init_module);
module_exit(sig2pid_exit_module);


    參考資料:
  • http://old.nabble.com/signals-handling-in-the-kernel-to12032525.html#a12032525 , signals handling in the kernel.
  • http://kerneltrap.org/node/5800, how to send signal from kernel space to user space.

github: https://github.com/brook-kuo/Linux_Module/tree/master/process/send_sig_to_userspace


2011年11月6日 星期日

利用gen_init_cpio建立initrd的script


利用gen_init_cpio建立initrd的script
#!/bin/bash
#kernel的目錄
KERN_DIR=/usr/src/linux-kvm
#gen_initramfs_list.sh產生的暫存檔
INITRAMFS_LIST=/tmp/gen_initramfs_list
#initramfs的來源目錄, 為傳入該script的第一個參數
INITRAMFS_DIR=$1
#initrd的目的檔名, 為傳入該script的第二個參數
INITRD=$2
#給INITRAMFS_DIR default值
: ${INITRAMFS_DIR:="/home/brook/projects/rootfs"}
#給INITRD default值, 
: ${INITRD:="/home/brook/initrd"}

if [ ! -d $INITRAMFS_DIR ]; then
    echo "usage: $0 <initramfs_dir> <output_file>"
    exit 1
fi

sh $KERN_DIR/scripts/gen_initramfs_list.sh -d $INITRAMFS_DIR > $INITRAMFS_LIST
$KERN_DIR/usr/gen_init_cpio $INITRAMFS_LIST > $INITRD



2011年5月7日 星期六

Linux Kernel(14)- Kernel Synchronization


這裡簡單介紹的介紹一下Kernel Synchronization的幾個觀念的幾個觀念。
  1. Race Condition
  2. 當多個process同時對同一個資料進行存取,且執行結果和其存取順序的不同而有所不同,稱為race condition。
  3. Critical Regions(或稱critical sections)
  4. 指的是一個存取共用資源的程式片斷。
  5. Kernel Synchronization
  6. 簡單說防止kernel發生race condition。保護的重點是shared data,而且保護的範圍和負擔越小越好。

Kernel Synchronization常見的作法就是locking,每次只允許一個process可以存取share data,就可以避免race condition了。

    在kernel中有幾種引發concurrency的可能
  1. Interrupt
  2. 當CPU收到interrupt會立刻中斷目前工作去執行interrupt handler。
  3. Softirq
  4. 當softirq被raise起來就會中斷目前工作去執行softirq。
  5. kernel preemption
  6. 如果kernel preemption被開啟,那麼task就可能被中斷。
  7. Sleeping
  8. 如果kernel執行到blocking code,就可能被schedule出去。
  9. SMP
  10. 同時間兩個以上的CPU存取相同的資料就會發生race condition。


在linunx kernel中,執行的context主要分成兩種interrupt context和process context,凡是只要in_interrupt()都是interrupt context,所以引起的原因包含Hardware interrupt和softirq兩種。以下就擷取片段程式碼說明。
// thread_info存放在task的stack裡面
# define task_thread_info(task)  ((struct thread_info *)(task)->stack)

// thread_info中的preempt_count分成幾個部份
// bits 0-7 are the preemption count (max preemption depth: 256)
// bits 8-15 are the softirq count (max # of softirqs: 256)
// bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
// bit 26 is the NMI_MASK
// bit 28 is the PREEMPT_ACTIVE flag
struct thread_info {
    struct task_struct *task;   /* main task structure */
    int    preempt_count;       /* 0 => preemptable */
};

# define preempt_count()        (current_thread_info()->preempt_count)
# define add_preempt_count(val) do { preempt_count() += (val); } while (0)
# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)

// 當每次呼叫irq_enter()就會將preempt_count屬於HARDIRQ的部份遞增
#define __irq_enter()                       \
    do {                                    \
        account_system_vtime(current);      \
        add_preempt_count(HARDIRQ_OFFSET);  \
        trace_hardirq_enter();              \
    } while (0)

# define invoke_softirq()   do_softirq()
# define IRQ_EXIT_OFFSET    HARDIRQ_OFFSET
/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
    // 離開hard interrupt所以要減回去HARDIRQ_OFFSET
    sub_preempt_count(IRQ_EXIT_OFFSET);
    // 如果不在interrupt context(如softirq裡面),
    // 而且softirq有被raise就執行softirq
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();
}

此圖出於http://blog.csdn.net/hero7935/archive/2011/05/07/6401522.aspx

    參考資料:
  • Linux Kernel Development 2nd, Novell Press



2011年2月27日 星期日

Linux Modules(14.1)- Read Copy Update


RCU (Read-Copy Update)是kernel同步機制之一,允許多個reader在writer更新資料的同時讀取資料,reader可能讀到更新前或更新後的,但是資料內容是一致的(不是新的就是舊的,這是因為RCU利用指標的dereference和assign達成的),另外,RCU也能確保資料在read-side使用時不會將之free(下一篇介紹RCU的原理在提吧)。


這裡有一張圖用來描述RCU再經典不過了。首先,藍色的reader的開始就是rcu_read_lock(),結束就是rcu_read_unlock(),下面的removal、grace period和reclamation代表著writer的狀態,這邊只要保證讀到舊資料的reader(就是開頭落在removal的reader),都能在grace period結束之前,離開read-side就可以了,聰明的你一定可以看出在grace period開始之後的reader都是讀到新資料,所以RCU就不管他想用多久。
    RCU的三個階段
  1. removal:更新指標。
  2. grace period:等待所有持有舊資料的reader都離開RCU read-side。
  3. reclamation:回收舊資料。

RCU本身就是read-write lock的一種,所以我們介紹一下RCU的reader和writer的形式。
struct foo {
    int x;
};

static struct foo *foo = NULL;

// Reader的形式
static int reader(void)
{
    int ret;

    rcu_read_lock();
    ret = rcu_dereference(foo)->x;
    rcu_read_unlock();

    return ret;
}

// Writer的形式
static void writer(int x)
{
    struct foo *new_foo, *old_foo = foo;

    // 建立新的資料內容new_foo
    new_foo = kmalloc(sizeof(struct foo), GFP_KERNEL);

    // 複製原本的內容
    *new_foo = *old_foo;

    // 修改內容
    new_foo->x = x;

    // removal
    rcu_assign_pointer(foo, new_foo);

    // grace period
    synchronize_rcu();

    // reclamation: 
    kfree(old_foo);

}

synchronize_rcu()就是在等待所謂的grace period,等所有持舊資料的reader都離開RCU read-side才會往下執行kfree(old_foo)。