顯示具有 Socket - NETLINK 標籤的文章。 顯示所有文章
顯示具有 Socket - NETLINK 標籤的文章。 顯示所有文章

2011年2月13日 星期日

Netlink, NETLINK_FIREWALL


關於NETLINK的介紹請看Netlink introduction,這裡假設您已經了解NETLINK,並且準備使用NETLINK_FIREWALL這個netlink family。使用這個family之前必須先載入ip_queue.ko這個module,或者直接將它編進kernel當中。

我們由kernel的觀點來看NETLINK_FIREWALL提供哪些功能,首先看到net/ipv4/netfilter/ip_queue.c
/*
 * Excerpt of the ip_queue module init function; "..." marks code that is
 * omitted from this excerpt.
 */
static int __init ip_queue_init(void)
{
    ...
    // Register the handler for NETLINK_FIREWALL, i.e. ipq_rcv_skb
        ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
                                      ipq_rcv_skb, NULL, THIS_MODULE);
    ...
}

/*
 * Handler passed to netlink_kernel_create() for NETLINK_FIREWALL: runs
 * __ipq_rcv_skb() on the skb while holding ipqnl_mutex.
 */
static void
ipq_rcv_skb(struct sk_buff *skb)
{
        mutex_lock(&ipqnl_mutex);
        __ipq_rcv_skb(skb);
        mutex_unlock(&ipqnl_mutex);
}

/*
 * Excerpt ("..." marks omitted code): hands the payload of the netlink
 * message to ipq_receive_peer() and acknowledges the request when the
 * sender asked for it.
 */
static inline void
__ipq_rcv_skb(struct sk_buff *skb)
{
    ...
    // Payload length = total message length minus the netlink header length.
    status = ipq_receive_peer(NLMSG_DATA(nlh), type,
                              nlmsglen - NLMSG_LENGTH(0));
    if (status < 0)
            RCV_SKB_FAIL(status);

    // NLM_F_ACK set in the request flags: reply with an ack carrying error code 0.
    if (flags & NLM_F_ACK)
            netlink_ack(skb, nlh, 0);
}

// This is the function that provides the NETLINK_FIREWALL control operations.
// It covers setting which packet data is copied to user space,
// and setting a packet's verdict (NF_DROP/NF_ACCEPT, etc.).
//
// pmsg: payload of the netlink message; type: netlink message type;
// len: payload length in bytes.
// Returns 0 or a negative errno value (-EINVAL for a short message, an
// unknown type, or a verdict value above NF_MAX_VERDICT).
static int
ipq_receive_peer(struct ipq_peer_msg *pmsg,
                 unsigned char type, unsigned int len)
{
        int status = 0;

        if (len < sizeof(*pmsg))
                return -EINVAL;

        switch (type) {
        case IPQM_MODE:
         // Select the copy-to-user-space mode: IPQ_COPY_META or IPQ_COPY_PACKET
                status = ipq_set_mode(pmsg->msg.mode.value,
                                      pmsg->msg.mode.range);
                break;

        case IPQM_VERDICT:
        // Verdict for a packet
                if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
                        status = -EINVAL;
                else
                        status = ipq_set_verdict(&pmsg->msg.verdict,
                                                 len - sizeof(*pmsg));
                        // NOTE: despite its indentation, this break is not part of
                        // the else branch; it ends the IPQM_VERDICT case.
                        break;
        default:
                status = -EINVAL;
        }
        return status;
}
上述這段code就能大概了解NETLINK_FIREWALL在kernel的流程與提供的facility為何,透過IPQM_MODE設定copy to user-space的資料模式,當user-space收到資料後,判斷該資料是要DROP還是ACCEPT,決定後再透過IPQM_VERDICT告訴kernel,該封包是要DROP還是ACCEPT。

初步了解kernel提供的功能之後,下面就寫一個當接收到icmp echo封包,且seq為奇數的就DROP,其餘的就ACCEPT的範例。
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/in.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4/ip_queue.h>

/**
 * Open a raw netlink socket for the given protocol (netlink family) and
 * bind it to an address whose nl_pid is this process's PID.
 *
 * @param proto  netlink family passed as the socket() protocol argument
 * @return the bound socket descriptor, or -1 on failure (a message is
 *         printed to stderr and the socket, if opened, is closed)
 */
static int create_nl_socket(int proto)
{
    struct sockaddr_nl local = {
        .nl_family = AF_NETLINK,
        .nl_pid = getpid(),
    };
    int fd = socket(AF_NETLINK, SOCK_RAW, proto);

    if (fd < 0) {
        fprintf(stderr, "open sock failed.(%s)\n", strerror(errno));
        return -1;
    }

    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
        /* Report first: close() may overwrite errno. */
        fprintf(stderr, "bind failed.(%s)\n", strerror(errno));
        close(fd);
        return -1;
    }

    return fd;
}


/**
 * 設定IPQM_MODE
 */
static int ipq_set_mode(int sock, uint8_t mode, size_t range)
{
    unsigned char buf[1024];
    struct msghdr msg;
    struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
    struct nlmsghdr *nlh;
    struct ipq_peer_msg *pmsg;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = sizeof(buf)
    };

    memset(buf, 0, sizeof(buf));
    msg = (struct msghdr) {
            .msg_name = (void *)&dst,
            .msg_namelen = sizeof(dst),
            .msg_iov = &iov,
            .msg_iovlen = 1,
    };

    nlh = (struct nlmsghdr*) buf;
    *nlh = (struct nlmsghdr) {
        .nlmsg_len = sizeof(buf),
        .nlmsg_flags = NLM_F_REQUEST,
        .nlmsg_type = IPQM_MODE,
        .nlmsg_pid = getpid(),
    };

    pmsg = (struct ipq_peer_msg*) NLMSG_DATA(nlh);
    *pmsg = (struct ipq_peer_msg) {
        .msg.mode.value = mode, // IPQM_META或是IPQM_PACKET
        .msg.mode.range = range, // 封包的大小
    };

    printf("%s(#%d):  nlmsglen:%d, NLMSG_LENGTH(0):%d\n",
            __func__, __LINE__, nlh->nlmsg_len, NLMSG_LENGTH(0));
    return sendmsg(sock, &msg, 0);
}

/**
 * Print the metadata of a queued packet followed by a hex dump of its
 * payload (16 bytes per line).
 *
 * @param ipq_pkt  packet message to print
 */
static void print_pkt(ipq_packet_msg_t *ipq_pkt)
{
    size_t i;

    /* Each field group ends in "\n"; no separator is printed at line start. */
    printf("packet_id:0x%lx, mark:0x%lx\n"
            "hook:%u, idev:%s, odev:%s\n"
            "hw_proto:%d, hw_type:%d, hw_addrlen:%d\n"
            "hw_addr:0x%02X%02X%02X%02X%02X%02X%02X%02X\n"
            "data_len:%zu, payload:\n",
            ipq_pkt->packet_id, ipq_pkt->mark,
            ipq_pkt->hook, ipq_pkt->indev_name, ipq_pkt->outdev_name,
            ipq_pkt->hw_protocol, ipq_pkt->hw_type, ipq_pkt->hw_addrlen,
            ipq_pkt->hw_addr[0], ipq_pkt->hw_addr[1],
            ipq_pkt->hw_addr[2], ipq_pkt->hw_addr[3],
            ipq_pkt->hw_addr[4], ipq_pkt->hw_addr[5],
            ipq_pkt->hw_addr[6], ipq_pkt->hw_addr[7],
            ipq_pkt->data_len);

    for (i = 0; i < ipq_pkt->data_len; i++) {
        printf("%02X ", ipq_pkt->payload[i]);
        if ((i + 1) % 16 == 0)
            printf("\n");
    }
    /* Terminate a partially filled last row so later output starts on a new line. */
    if (ipq_pkt->data_len % 16 != 0)
        printf("\n");
}


/**
 * Decide the verdict for a queued packet from its contents.
 *
 * Packets too short to hold the headers that are inspected are dropped.
 * ICMP packets whose echo sequence number is odd are dropped; everything
 * else is accepted.
 *
 * @param ipq_pkt  packet message received from the kernel
 * @param verdict  out: NF_DROP or NF_ACCEPT
 * @param id       out: packet_id of ipq_pkt, to be echoed back with the verdict
 */
static void 
get_verdict(ipq_packet_msg_t *ipq_pkt, int *verdict, unsigned long *id)
{
    const struct iphdr *iph;
    const struct icmphdr *icmph;
    size_t ip_hdr_len;

    *id = ipq_pkt->packet_id;
    if (ipq_pkt->data_len < sizeof(struct iphdr)) {
        *verdict = NF_DROP;
        return;
    }
    iph = (const struct iphdr *) ipq_pkt->payload;
    if (iph->protocol == IPPROTO_ICMP) {
        /* ihl counts 32-bit words; make sure the IP header is sane and that
         * the whole ICMP header lies inside the copied data before reading it. */
        ip_hdr_len = (size_t) iph->ihl * 4;
        if (ip_hdr_len < sizeof(struct iphdr) ||
            ipq_pkt->data_len < ip_hdr_len + sizeof(struct icmphdr)) {
            *verdict = NF_DROP;
            return;
        }
        icmph = (const struct icmphdr *) (ipq_pkt->payload + ip_hdr_len);
        printf("Type: %d, Id:0x%04x, seq:0x%04x\n",
                icmph->type, ntohs(icmph->un.echo.id),
                ntohs(icmph->un.echo.sequence));
        /* Odd sequence number: drop it. */
        if (ntohs(icmph->un.echo.sequence) % 2) {
            *verdict = NF_DROP;
            return;
        }
    }
    /* Everything else is accepted. */
    *verdict = NF_ACCEPT;
}

/**
 * 設定封包的verdict
 */
static int set_verdict(int sock, int verdict, unsigned long id)
{
    unsigned char buf[1024];
    struct msghdr msg;
    struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
    struct nlmsghdr *nlh;
    struct ipq_peer_msg *pmsg;
    struct iovec iov = { .iov_base = (void *) buf, .iov_len = sizeof(buf) };

    memset(buf, 0, sizeof(buf));
    msg = (struct msghdr) {
            .msg_name = (void *)&dst,
            .msg_namelen = sizeof(dst),
            .msg_iov = &iov,
            .msg_iovlen = 1,
    };

    nlh = (struct nlmsghdr*) buf;
    *nlh = (struct nlmsghdr) {
        .nlmsg_len = sizeof(buf),
        .nlmsg_flags = NLM_F_REQUEST,
        .nlmsg_type = IPQM_VERDICT,
        .nlmsg_pid = getpid(),
    };

    pmsg = (struct ipq_peer_msg*) NLMSG_DATA(nlh);
    *pmsg = (struct ipq_peer_msg) {
        .msg.verdict.value = verdict, // NF_DROP或是NF_ACCEPT
        // packet_id詳細資料請看kernel的ipq_set_verdict()
        .msg.verdict.id = id,
    };

    char *p = "NONE";
    switch (verdict) {
        case NF_DROP:
            p = "DROP";
            break;
        case NF_ACCEPT:
            p = "ACCEPT";
            break;
    }
    printf("%s(#%d): %s packet %ld\n", __func__, __LINE__, p, id);
    return sendmsg(sock, &msg, 0);
}

/**
 * 處理接收到的封包
 */
static int ipq_recv_pkt(int sock, size_t len)
{
    unsigned char buf[NLMSG_SPACE(0) + len];
    struct msghdr msg;
    struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
    struct nlmsghdr *nlh;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = len,
    };

    memset(buf, 0, sizeof(buf));
    msg = (struct msghdr) {
            .msg_name = (void *)&dst,
            .msg_namelen = sizeof(dst),
            .msg_iov = &iov,
            .msg_iovlen = 1,
    };

    len = recvmsg(sock, &msg, 0);
    for (nlh = (struct nlmsghdr *) buf; NLMSG_OK (nlh, len);
            nlh = NLMSG_NEXT (nlh, len)) {
        /* The end of multipart message. */
        if (nlh->nlmsg_type == NLMSG_DONE) {
            printf("NLMSG_DONE\n");
            return 0;
        }

        /* Do some error handling. */
        if (nlh->nlmsg_type == NLMSG_ERROR) {
            fprintf(stderr, "NLMSG_ERROR\n");
            return -1;
        }

        if (nlh->nlmsg_type == IPQM_PACKET) {
            int verdict;
            unsigned long id;

            print_pkt(NLMSG_DATA(nlh));
            get_verdict(NLMSG_DATA(nlh), &verdict, &id);
            set_verdict(sock, verdict, id);
        }
    }
    return 0;
}


/**
 * Demo entry point: open a NETLINK_FIREWALL socket, ask the kernel to copy
 * whole packets to user space, then handle a fixed number of receives.
 *
 * @return 0 on success, 1 if the socket could not be set up or a receive failed
 */
int main(int argc, char *argv[])
{
    enum { COPY_RANGE = 2048, RECV_ROUNDS = 10 };
    int sock, cnt;
    int status = 0;

    (void) argc;
    (void) argv;

    sock = create_nl_socket(NETLINK_FIREWALL);
    if (sock < 0) {
        fprintf(stderr, "create_nl_socket failed\n");
        return 1;
    }

    /* Without a copy mode there is nothing to receive, so do not go on to
     * the receive loop if the request could not be sent. */
    if (ipq_set_mode(sock, IPQ_COPY_PACKET, COPY_RANGE) < 0) {
        fprintf(stderr, "ipq_set_mode failed\n");
        close(sock);
        return 1;
    }
    printf("ipq_set_mode success\n");

    for (cnt = 0; cnt < RECV_ROUNDS; cnt++) {
        if (ipq_recv_pkt(sock, COPY_RANGE) < 0) {
            fprintf(stderr, "ipq_recv_pkt failed\n");
            status = 1;
            break;
        }
    }
    close(sock);
    return status;
}


透過iptables將經過OUTPUT chain的packet送到QUEUE target(例如:iptables -A OUTPUT -p icmp -j QUEUE),這樣kernel才會將packet丟到NETLINK_FIREWALL處理。


您可以看到kernel送出來的packet內容是從IP header開始。


您可以發現ping有一半的packet被DROP了。

Kernel version:2.6.37


2010年5月22日 星期六

Netlink introduction


Netlink被用來當作kernel和user space之間溝通資訊的方式之一,使用標準的socket介面來作為Netlink的API,其address family必須填AF_NETLINK,而socket type為SOCK_RAW或SOCK_DGRAM,protocol則依所使用的netlink family而有所不同,如NETLINK_ROUTE或NETLINK_GENERIC等等(詳細資訊可以man 7 netlink)。

/* Open a netlink socket; netlink_family selects the netlink family.
 * The `socket` on the left-hand side is a placeholder for the descriptor
 * variable; in real code give it a different name than the function. */
socket = socket(AF_NETLINK, SOCK_RAW, netlink_family);

Netlink Socket Address Structure

如同一般的socket,Netlink也需要socket address,其socket address為:
/* Socket address used with AF_NETLINK sockets. */
struct sockaddr_nl {
    sa_family_t     nl_family;  /* AF_NETLINK */
    unsigned short  nl_pad;     /* Zero. */
    pid_t           nl_pid;     /* Identifies a netlink socket (not necessarily a process ID); 0 for the kernel or multicast. */
    __u32           nl_groups;  /* Multicast groups mask. */
};
如果傳送的對象是kernel或者以multicast傳送,則nl_pid設為0。如果是kernel以unicast傳送給user-space上面的application,就會填入該application的nl_pid。然而nl_pid實際的意義並不是指PID,而只是用於識別一個netlink socket而已;application在建立netlink socket時,可以將nl_pid設為0,然後bind(),kernel會自動將PID填入。而nl_groups則用於multicast,採用bit mask方式,所以每個netlink family有32個multicast group。

Netlink Format

Netlink message包含一個或多個struct nlmsghdr,主要用於辨識後面的payload內容為何。
/* Header that precedes the payload of each netlink message and identifies
 * what that payload contains. */
struct nlmsghdr {
    __u32 nlmsg_len;    /* Length of message including header. */
    __u16 nlmsg_type;   /* Type of message content. */
    __u16 nlmsg_flags;  /* Additional flags. */
    __u32 nlmsg_seq;    /* Sequence number. */
    __u32 nlmsg_pid;    /* PID of the sending process. */
};

所以常見的讀取netlink message形式如下:
  int len;
  char buf[4096];
  struct iovec iov = { buf, sizeof(buf) };
  struct sockaddr_nl sa;
  struct msghdr msg;
  struct nlmsghdr *nh;

  msg = { (void *)&sa, sizeof(sa), &iov, 1, NULL, 0, 0 };
  len = recvmsg(fd, &msg, 0);
  for (nh = (struct nlmsghdr *) buf; NLMSG_OK (nh, len);
    nh = NLMSG_NEXT (nh, len)) {
    /* The end of multipart message. */
    if (nh->nlmsg_type == NLMSG_DONE)
      return;
    if (nh->nlmsg_type == NLMSG_ERROR)
      /* Do some error handling. */
      ...
    /* Continue with parsing payload. */
    ...
  }


Macro

Netlink也提供一些macro來操作這些netlink datagram,因為kernel和user-space溝通會有align的問題,所以不論是在組packet或者parse packet,都請用這些macro。
/* Alignment boundary, in bytes, used by the macros below. */
#define NLMSG_ALIGNTO 4
/* Round len up to the next multiple of NLMSG_ALIGNTO. */
#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) )
/* Aligned size of struct nlmsghdr. */
#define NLMSG_HDRLEN  ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
/* Header length plus len payload bytes; the value to store in nlmsg_len. */
#define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(NLMSG_HDRLEN))
/* NLMSG_LENGTH(len) rounded up to the alignment boundary. */
#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len))
/* Pointer to the payload that follows the header nlh. */
#define NLMSG_DATA(nlh)  ((void*)(((char*)nlh) + NLMSG_LENGTH(0)))
/* Subtract the aligned length of the current message from len (len is
 * modified) and yield a pointer to the next header. */
#define NLMSG_NEXT(nlh,len)  ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \
      (struct nlmsghdr*)(((char*)(nlh)) + \
                                 NLMSG_ALIGN((nlh)->nlmsg_len)))
/* True when the remaining len can hold a header and nlmsg_len is at least
 * the header size and no larger than len. */
#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
      (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
      (nlh)->nlmsg_len <= (len))
/* nlmsg_len minus NLMSG_SPACE(len). */
#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len)))
NLMSG_ALIGN()很明顯的就是在做align。
NLMSG_HDRLEN()回傳struct nlmsghdr所占的大小。
NLMSG_LENGTH()回傳len + struct nlmsghdr所占的大小,通常用於填struct nlmsghdr的nlmsg_len用。
NLMSG_SPACE()回傳len + struct nlmsghdr再取align的大小,也就是netlink message的大小。
NLMSG_DATA()回傳nlh往後一個struct nlmsghdr所占的大小,就是取struct nlmsghdr後面跟著的資料。
NLMSG_NEXT()取得下一個struct nlmsghdr,並且將len減去目前這個message經過align後的nlmsg_len,就等同在算剩餘大小。
NLMSG_OK()先算剩餘長度是否大於等於一個struct nlmsghdr,再算這個nlmsg_len是否大於等於一個struct nlmsghdr,而且nlmsg_len還要小於等於剩餘長度(len),如果都符合,就算是一個OK的struct nlmsghdr。

各macro與netlink message之間的關係。

Common API

基於netlink family基本上都是大同小異,所以參考Documentation/accounting/getdelays.c寫了一些API來用。
/*
 * Open a raw netlink socket for the given protocol (netlink family) and
 * bind it with nl_pid left at 0.
 *
 * Returns the bound socket descriptor, or -1 on failure (a message is
 * printed to stderr and the socket, if opened, is closed).
 */
static int create_nl_socket(int proto)
{
    struct sockaddr_nl local = { .nl_family = AF_NETLINK };
    int fd = socket(AF_NETLINK, SOCK_RAW, proto);

    if (fd < 0) {
        fprintf(stderr, "open sock failed.(%s)\n", strerror(errno));
        return -1;
    }

    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
        /* Report first: close() may overwrite errno. */
        fprintf(stderr, "bind failed.(%s)\n", strerror(errno));
        close(fd);
        return -1;
    }

    return fd;
}


這個章節單純的只對netlink提供一些overview,後面會針對各個netlink family有更詳細的描述。