Monday, December 17, 2012

OpenvSwitch Code Analysis (Part 4)

This section analyzes the mechanism by which ovsd and the datapath communicate with each other over netlink.

The datapath runs in kernel space, while ovsd runs in user space; the two communicate through netlink.

How the datapath uses generic netlink

In dp_init() (datapath.c), dp_register_genl() is called to register four generic netlink families and their operations: datapath, vport, flow, and packet. The first three families each support four operations: NEW, DEL, GET, and SET; the packet family supports only EXECUTE.
These families and their operations are all defined in datapath.c.
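Before looking at one family in detail, it helps to see the registration pattern itself. Below is a simplified sketch of dp_register_genl(): the family and ops identifiers follow the naming pattern of dp_flow_genl_family shown next, but their exact spellings, the multicast-group registration, and the error unwinding the real function performs are elided here:

static int register_families_sketch(void)
{
    /* Pair each genl_family with its ops array; identifiers other than
     * dp_flow_genl_family/dp_flow_genl_ops are illustrative. */
    static const struct {
        struct genl_family *family;
        struct genl_ops *ops;
        int n_ops;
    } families[] = {
        { &dp_datapath_genl_family, dp_datapath_genl_ops,
          ARRAY_SIZE(dp_datapath_genl_ops) },
        { &dp_vport_genl_family, dp_vport_genl_ops,
          ARRAY_SIZE(dp_vport_genl_ops) },
        { &dp_flow_genl_family, dp_flow_genl_ops,
          ARRAY_SIZE(dp_flow_genl_ops) },
        { &dp_packet_genl_family, dp_packet_genl_ops,
          ARRAY_SIZE(dp_packet_genl_ops) },
    };
    int err;
    int i;

    for (i = 0; i < ARRAY_SIZE(families); i++) {
        err = genl_register_family_with_ops(families[i].family,
                                            families[i].ops,
                                            families[i].n_ops);
        if (err)
            return err;   /* the real code unregisters what it registered */
    }
    return 0;
}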
flow family为例。代码为
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
   [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
   [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
   [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
};

static struct genl_family dp_flow_genl_family = {
   .id = GENL_ID_GENERATE,      /* ask the kernel to assign a unique family id */
   .hdrsize = sizeof(struct ovs_header),
   .name = OVS_FLOW_FAMILY,
   .version = OVS_FLOW_VERSION,
   .maxattr = OVS_FLOW_ATTR_MAX,
    SET_NETNSOK                 /* compat macro: marks the family netns-aware where supported */
};
And the ops bound to this family are defined as:
static struct genl_ops dp_flow_genl_ops[] = {
   { .cmd = OVS_FLOW_CMD_NEW,
     .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
     .policy = flow_policy,
     .doit = ovs_flow_cmd_new_or_set
   },
   { .cmd = OVS_FLOW_CMD_DEL,
     .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
     .policy = flow_policy,
     .doit = ovs_flow_cmd_del
   },
   { .cmd = OVS_FLOW_CMD_GET,
     .flags = 0,                               /* OK for unprivileged users. */
     .policy = flow_policy,
     .doit = ovs_flow_cmd_get,
     .dumpit = ovs_flow_cmd_dump
   },
   { .cmd = OVS_FLOW_CMD_SET,
     .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
     .policy = flow_policy,
     .doit = ovs_flow_cmd_new_or_set,
   },
};
As can be seen, the nlmsg format the datapath defines carries, in addition to the netlink and generic netlink headers, a custom ovs_header.
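For reference, struct ovs_header (include/linux/openvswitch.h) holds nothing but the datapath's ifindex, so the complete wire layout of one OVS message is as sketched below:

/* From include/linux/openvswitch.h. */
struct ovs_header {
    int dp_ifindex;   /* which datapath the message is about */
};

/* Wire layout of one OVS generic netlink message:
 *
 *   struct nlmsghdr     netlink header (len, type = family id, flags, seq, pid)
 *   struct genlmsghdr   generic netlink header (cmd, version)
 *   struct ovs_header   OVS header (dp_ifindex)
 *   struct nlattr ...   attributes, validated against the family's nla_policy
 */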

How ovsd uses netlink

ovsd's netlink support is implemented mainly in lib/netlink-socket.c. The calls into these netlink operations live mainly in lib/dpif-linux.c (taking dpif_linux_class as the example), where each dpif action is handled; all of the possible message types were registered with the kernel in advance by the datapath module.
Because the datapath registers the netlink families, ovsd must look up their information before using them. This lookup happens in dpif_linux_init() in lib/dpif-linux.c (again taking dpif_linux_class as the example). The code is:
static int
dpif_linux_init(void)
{
    static int error = -1;      /* < 0: the one-time lookups have not run yet */

    if (error < 0) {
        unsigned int ovs_vport_mcgroup;

        error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
                                      &ovs_datapath_family);
        if (error) {
            VLOG_ERR("Generic Netlink family '%s' does not exist. "
                     "The Open vSwitch kernel module is probably not loaded.",
                     OVS_DATAPATH_FAMILY);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_PACKET_FAMILY,
                                          &ovs_packet_family);
        }
        if (!error) {
            error = nl_sock_create(NETLINK_GENERIC, &genl_sock);
        }
        if (!error) {
            error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
                                           &ovs_vport_mcgroup,
                                           OVS_VPORT_MCGROUP_FALLBACK_ID);
        }
        if (!error) {
            static struct dpif_linux_vport vport;
            nln = nln_create(NETLINK_GENERIC, ovs_vport_mcgroup,
                             dpif_linux_nln_parse, &vport);
        }
    }

    return error;
}
Because error is static and initialized to -1, the lookups run only on the first call; later calls simply return the cached result. Once the lookups succeed, ovsd can operate on the datapath through the dpif APIs, which emit the corresponding netlink messages to the datapath.
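nl_lookup_genl_family() itself speaks the generic netlink controller protocol: it sends CTRL_CMD_GETFAMILY to the kernel's controller family and pulls CTRL_ATTR_FAMILY_ID out of the reply. The standalone sketch below illustrates that protocol on the happy path only; it is not OVS's implementation, which goes through the nl_sock/ofpbuf helpers and checks for NLMSG_ERROR replies:

#include <linux/genetlink.h>
#include <linux/netlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Resolve a generic netlink family name to its numeric id.
 * Returns the id, or -1 on any failure. */
static int
lookup_genl_family_sketch(const char *name)
{
    struct {
        struct nlmsghdr nlh;
        struct genlmsghdr genl;
        char attrs[256];
    } req;
    char reply[1024];
    struct nlattr *nla;
    int fd, len;

    fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
    if (fd < 0) {
        return -1;
    }

    memset(&req, 0, sizeof req);
    req.nlh.nlmsg_type = GENL_ID_CTRL;          /* the controller family */
    req.nlh.nlmsg_flags = NLM_F_REQUEST;
    req.genl.cmd = CTRL_CMD_GETFAMILY;
    req.genl.version = 1;

    /* One attribute: CTRL_ATTR_FAMILY_NAME = the name to resolve. */
    nla = (struct nlattr *) req.attrs;
    nla->nla_type = CTRL_ATTR_FAMILY_NAME;
    nla->nla_len = NLA_HDRLEN + strlen(name) + 1;
    memcpy((char *) nla + NLA_HDRLEN, name, strlen(name) + 1);
    req.nlh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN) + NLA_ALIGN(nla->nla_len);

    if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0
        || (len = recv(fd, reply, sizeof reply, 0)) < 0) {
        close(fd);
        return -1;
    }
    close(fd);

    /* Walk the reply's attributes looking for CTRL_ATTR_FAMILY_ID. */
    nla = (struct nlattr *) (reply + NLMSG_LENGTH(GENL_HDRLEN));
    len -= NLMSG_LENGTH(GENL_HDRLEN);
    while (len >= NLA_HDRLEN) {
        if ((nla->nla_type & NLA_TYPE_MASK) == CTRL_ATTR_FAMILY_ID) {
            return *(unsigned short *) ((char *) nla + NLA_HDRLEN);
        }
        len -= NLA_ALIGN(nla->nla_len);
        nla = (struct nlattr *) ((char *) nla + NLA_ALIGN(nla->nla_len));
    }
    return -1;
}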
The relevant mid-layer APIs are defined by the abstract dpif_class type (lib/dpif-provider.h).
The concrete implementations of these abstract APIs come in two flavors: dpif_linux_class (lib/dpif-linux.c) and dpif_netdev_class (lib/dpif-netdev.c). As a middle layer, these APIs are in turn wrapped by the higher-level APIs in lib/dpif.c.
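The dpif_class itself is an ordinary C vtable of function pointers. Here is an illustrative excerpt; the real struct in lib/dpif-provider.h declares many more callbacks, and of those shown, flow_put is the one traced below:

struct dpif;
struct dpif_flow_put;

struct dpif_class {
    /* Type name: "system" for dpif_linux_class, "netdev" for
     * dpif_netdev_class. */
    const char *type;

    /* Adds or modifies a flow in the datapath; reached from the generic
     * dpif_flow_put() wrapper in lib/dpif.c. */
    int (*flow_put)(struct dpif *dpif, const struct dpif_flow_put *put);

    /* Deletes every flow from the datapath. */
    int (*flow_flush)(struct dpif *dpif);
};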
Here we take dpif_flow_put() (lib/dpif.c) as an example to trace how a netlink message is built and sent. This function tries to set a flow in the bound datapath. It mainly calls dpif_flow_put__(), whose code is:
static int
dpif_flow_put__(struct dpif *dpif, const struct dpif_flow_put *put)
{
    int error;

    COVERAGE_INC(dpif_flow_put);
    assert(!(put->flags & ~(DPIF_FP_CREATE | DPIF_FP_MODIFY
                            | DPIF_FP_ZERO_STATS)));

    error = dpif->dpif_class->flow_put(dpif, put);
    if (error && put->stats) {
        memset(put->stats, 0, sizeof *put->stats);
    }
    log_flow_put_message(dpif, put, error);
    return error;
}
As shown, it invokes the abstract interface of the concrete dpif_class. Taking dpif_linux_class as the example, that interface resolves to dpif_linux_flow_put(), whose code follows:
static int
dpif_linux_flow_put(struct dpif *dpif_, const struct dpif_flow_put *put)
{
    struct dpif_linux_flow request, reply;
    struct ofpbuf *buf;
    int error;

    dpif_linux_init_flow_put(dpif_, put, &request);
    error = dpif_linux_flow_transact(&request,
                                     put->stats ? &reply : NULL,
                                     put->stats ? &buf : NULL);
    if (!error && put->stats) {
        dpif_linux_flow_get_stats(&reply, put->stats);
        ofpbuf_delete(buf);
    }
    return error;
}
The code performs two main steps: initializing the request with dpif_linux_init_flow_put(), and sending the nlmsg with dpif_linux_flow_transact().
dpif_linux_init_flow_put() builds the request message from put. Its code is:
static void
dpif_linux_init_flow_put(struct dpif *dpif_, const struct dpif_flow_put *put,
                         struct dpif_linux_flow *request)
{
    static struct nlattr dummy_action;

    struct dpif_linux *dpif = dpif_linux_cast(dpif_);

    dpif_linux_flow_init(request);
    request->cmd = (put->flags & DPIF_FP_CREATE
                    ? OVS_FLOW_CMD_NEW : OVS_FLOW_CMD_SET);
    request->dp_ifindex = dpif->dp_ifindex;
    request->key = put->key;
    request->key_len = put->key_len;
    /* Ensure that OVS_FLOW_ATTR_ACTIONS will always be included. */
    request->actions = put->actions ? put->actions : &dummy_action;
    request->actions_len = put->actions_len;
    if (put->flags & DPIF_FP_ZERO_STATS) {
        request->clear = true;
    }
    /* Unless the caller asked only to modify an existing flow, let the
     * kernel create the flow if it is not already present. */
    request->nlmsg_flags = put->flags & DPIF_FP_MODIFY ? 0 : NLM_F_CREATE;
}
dpif_linux_flow_transact() then takes care of actually sending the nlmsg. Its code is:
static int
dpif_linux_flow_transact(struct dpif_linux_flow *request,
                         struct dpif_linux_flow *reply, struct ofpbuf **bufp)
{
    struct ofpbuf *request_buf;
    int error;

    assert((reply != NULL) == (bufp != NULL));

    if (reply) {
        request->nlmsg_flags |= NLM_F_ECHO;
    }

    request_buf = ofpbuf_new(1024);
    dpif_linux_flow_to_ofpbuf(request, request_buf);
    error = nl_sock_transact(genl_sock, request_buf, bufp);
    ofpbuf_delete(request_buf);

    if (reply) {
        if (!error) {
            error = dpif_linux_flow_from_ofpbuf(reply, *bufp);
        }
        if (error) {
            dpif_linux_flow_init(reply);
            ofpbuf_delete(*bufp);
            *bufp = NULL;
        }
    }
    return error;
}
dpif_linux_flow_transact()'s first argument, request, holds the data for the message to be sent; if the second and third arguments are non-NULL, they are used to store any reply that comes back.
The function first calls dpif_linux_flow_to_ofpbuf(), which serializes the attributes in request into a struct ofpbuf *request_buf laid out as ovs_header + attributes (recall that struct ovs_header, in include/linux/openvswitch.h, holds only a dp_ifindex). It then calls nl_sock_transact() to send the message.
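A simplified sketch of that serialization, using OVS's own lib/netlink and lib/ofpbuf helpers (the real dpif_linux_flow_to_ofpbuf() also emits optional attributes, such as OVS_FLOW_ATTR_CLEAR, that are omitted here):

/* Simplified sketch of dpif_linux_flow_to_ofpbuf(). */
static void
flow_to_ofpbuf_sketch(const struct dpif_linux_flow *flow, struct ofpbuf *buf)
{
    struct ovs_header *ovs_header;

    /* Netlink + generic netlink headers: the family id was resolved by
     * dpif_linux_init(); command and flags come from the request. */
    nl_msg_put_genlmsghdr(buf, 0, ovs_flow_family,
                          NLM_F_REQUEST | flow->nlmsg_flags,
                          flow->cmd, OVS_FLOW_VERSION);

    /* OVS header: nothing but the target datapath's ifindex. */
    ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
    ovs_header->dp_ifindex = flow->dp_ifindex;

    /* Attributes, matching the flow_policy the datapath registered. */
    if (flow->key_len) {
        nl_msg_put_unspec(buf, OVS_FLOW_ATTR_KEY, flow->key, flow->key_len);
    }
    if (flow->actions || flow->actions_len) {
        nl_msg_put_unspec(buf, OVS_FLOW_ATTR_ACTIONS,
                          flow->actions, flow->actions_len);
    }
}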
The code of nl_sock_transact() is:
int
nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
                 struct ofpbuf **replyp)
{
    struct nl_transaction *transactionp;
    struct nl_transaction transaction;

    transaction.request = CONST_CAST(struct ofpbuf *, request);
    transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
    transactionp = &transaction;

    nl_sock_transact_multiple(sock, &transactionp, 1);

    if (replyp) {
        if (transaction.error) {
            ofpbuf_delete(transaction.reply);
            *replyp = NULL;
        } else {
            *replyp = transaction.reply;
        }
    }

    return transaction.error;
}
nl_sock_transact() in turn calls nl_sock_transact_multiple() to send the message.
nl_sock_transact_multiple()'s first argument is the socket to send on, its second holds the transactions to send, and its third gives their count. Internally it mainly calls nl_sock_transact_multiple__(), which finalizes each nlmsg and sends it.
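Conceptually, that finalization patches the netlink header fields that only the socket layer knows, then hands the buffer to the kernel. A minimal standalone sketch of the idea (not OVS's actual code, which also batches transactions, matches replies to requests by sequence number, and handles errors):

#include <linux/netlink.h>
#include <stdint.h>
#include <sys/socket.h>

static int
nl_finalize_and_send_sketch(int fd, void *buf, uint32_t size,
                            uint32_t seq, uint32_t portid)
{
    struct nlmsghdr *nlmsg = buf;

    nlmsg->nlmsg_len = size;      /* total length, headers included */
    nlmsg->nlmsg_seq = seq;       /* echoed back in the kernel's reply */
    nlmsg->nlmsg_pid = portid;    /* this socket's netlink port id */

    return send(fd, buf, size, 0) < 0 ? -1 : 0;
}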
The other kinds of actions are handled through a similar process.