The datapath runs in kernel space, while ovsd runs in user space; the two communicate via netlink.
The datapath uses generic netlink
In dp_init() (datapath.c), dp_register_genl() is called to register four generic netlink families and their operations: datapath, vport, flow, and packet. The first three families each support four operations, NEW, DEL, GET, and SET, while the packet family supports only EXECUTE.
These families and operations are all defined in datapath.c.
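For reference, dp_register_genl() essentially walks a table that pairs each family with its ops array and registers them one by one. The sketch below illustrates that pattern using the older genl_register_family_with_ops() kernel API; the table type and its field names are simplified assumptions, not the literal datapath.c source.

/* Sketch of the registration pattern in dp_register_genl() (simplified;
 * the table type and its fields are illustrative assumptions). */
struct genl_family_and_ops {
    struct genl_family *family;
    struct genl_ops *ops;
    int n_ops;
};

static const struct genl_family_and_ops dp_genl_families[] = {
    { &dp_datapath_genl_family, dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops) },
    { &dp_vport_genl_family,    dp_vport_genl_ops,    ARRAY_SIZE(dp_vport_genl_ops) },
    { &dp_flow_genl_family,     dp_flow_genl_ops,     ARRAY_SIZE(dp_flow_genl_ops) },
    { &dp_packet_genl_family,   dp_packet_genl_ops,   ARRAY_SIZE(dp_packet_genl_ops) },
};

static int dp_register_genl(void)
{
    int i, err;

    for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
        const struct genl_family_and_ops *f = &dp_genl_families[i];

        /* Older kernels register a family together with its ops array. */
        err = genl_register_family_with_ops(f->family, f->ops, f->n_ops);
        if (err) {
            return err;  /* unregistering the already-registered families is omitted */
        }
    }
    return 0;
}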
Take the flow family as an example. The code is:
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
    [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
    [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
    [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
};

static struct genl_family dp_flow_genl_family = {
    .id = GENL_ID_GENERATE,
    .hdrsize = sizeof(struct ovs_header),
    .name = OVS_FLOW_FAMILY,
    .version = OVS_FLOW_VERSION,
    .maxattr = OVS_FLOW_ATTR_MAX,
    SET_NETNSOK
};
The ops bound to this family are defined as:
static struct genl_ops dp_flow_genl_ops[] = {
    { .cmd = OVS_FLOW_CMD_NEW,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_new_or_set
    },
    { .cmd = OVS_FLOW_CMD_DEL,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_del
    },
    { .cmd = OVS_FLOW_CMD_GET,
      .flags = 0,               /* OK for unprivileged users. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_get,
      .dumpit = ovs_flow_cmd_dump
    },
    { .cmd = OVS_FLOW_CMD_SET,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_new_or_set,
    },
};
As the code shows, besides the netlink header and the generic netlink header, the netlink messages defined by the datapath also carry a custom ovs_header.
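The overall message layout, and the ovs_header itself (which, per include/linux/openvswitch.h, stores only the ifindex identifying the target datapath), can be summarized as:

/*
 * Layout of an OVS generic netlink message:
 *
 *     struct nlmsghdr     netlink header
 *     struct genlmsghdr   generic netlink header
 *     struct ovs_header   OVS-specific header
 *     netlink attributes  e.g. OVS_FLOW_ATTR_KEY, OVS_FLOW_ATTR_ACTIONS, ...
 */
struct ovs_header {
    int dp_ifindex;   /* ifindex identifying the target datapath */
};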
ovsd uses netlink
On the ovsd side, the netlink machinery is implemented mainly in lib/netlink-socket.c, and these netlink operations are invoked mainly from the per-action handlers in lib/dpif-linux.c (taking dpif_linux_class as the example). The possible message types were registered with the kernel beforehand by the datapath module.
Because the datapath registers the netlink families, ovsd must look up their information before it can use them. This lookup happens in dpif_linux_init() in lib/dpif-linux.c (again using dpif_linux_class as the example). The code is:
static int
dpif_linux_init(void)
{
    static int error = -1;

    if (error < 0) {
        unsigned int ovs_vport_mcgroup;

        error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
                                      &ovs_datapath_family);
        if (error) {
            VLOG_ERR("Generic Netlink family '%s' does not exist. "
                     "The Open vSwitch kernel module is probably not loaded.",
                     OVS_DATAPATH_FAMILY);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_PACKET_FAMILY,
                                          &ovs_packet_family);
        }
        if (!error) {
            error = nl_sock_create(NETLINK_GENERIC, &genl_sock);
        }
        if (!error) {
            error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
                                           &ovs_vport_mcgroup,
                                           OVS_VPORT_MCGROUP_FALLBACK_ID);
        }
        if (!error) {
            static struct dpif_linux_vport vport;
            nln = nln_create(NETLINK_GENERIC, ovs_vport_mcgroup,
                             dpif_linux_nln_parse, &vport);
        }
    }

    return error;
}
Once these lookups complete, ovsd can operate on the datapath through the dpif APIs by sending these netlink messages to it.
The relevant intermediate-layer API is defined by the abstract type dpif_class in lib/dpif-provider.h.
The concrete implementations of these abstract APIs again fall into two concrete classes, dpif_linux_class (lib/dpif-linux.c) and dpif_netdev_class (lib/dpif-netdev.c). These intermediate-layer APIs are in turn wrapped by the higher-level APIs in lib/dpif.c.
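To make this layering concrete, the sketch below shows the slice of struct dpif_class relevant here; the member signature is taken from the calls quoted in this section, while the abbreviated surroundings are assumptions rather than the literal lib/dpif-provider.h contents.

/* Simplified excerpt of struct dpif_class (lib/dpif-provider.h); only the
 * member used in this walkthrough is spelled out, with its signature as
 * implied by the calls quoted below. */
struct dpif_class {
    const char *type;   /* datapath type, e.g. "system" for dpif_linux_class */

    /* ... open/close/destroy and port-related callbacks omitted ... */

    /* Invoked by dpif_flow_put__() in lib/dpif.c; dpif_linux_class points
     * this member at dpif_linux_flow_put(). */
    int (*flow_put)(struct dpif *dpif, const struct dpif_flow_put *put);

    /* ... flow_del/flow_get/flow_dump, execute, recv, etc. omitted ... */
};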
Let us take dpif_flow_put() (lib/dpif.c) as an example to analyze how a netlink message is built and sent. This function sets up a flow in the bound datapath. It mainly calls dpif_flow_put__(), whose code is:
static int
dpif_flow_put__(struct dpif *dpif, const struct dpif_flow_put *put)
{
    int error;

    COVERAGE_INC(dpif_flow_put);
    assert(!(put->flags & ~(DPIF_FP_CREATE | DPIF_FP_MODIFY
                            | DPIF_FP_ZERO_STATS)));

    error = dpif->dpif_class->flow_put(dpif, put);
    if (error && put->stats) {
        memset(put->stats, 0, sizeof *put->stats);
    }
    log_flow_put_message(dpif, put, error);
    return error;
}
As shown, it calls the abstract flow_put interface of the concrete dpif_class. Taking dpif_linux_class as the example, that interface is actually dpif_linux_flow_put(), whose code is:
static int
dpif_linux_flow_put(struct dpif *dpif_, const struct dpif_flow_put *put)
{
    struct dpif_linux_flow request, reply;
    struct ofpbuf *buf;
    int error;

    dpif_linux_init_flow_put(dpif_, put, &request);
    error = dpif_linux_flow_transact(&request,
                                     put->stats ? &reply : NULL,
                                     put->stats ? &buf : NULL);
    if (!error && put->stats) {
        dpif_linux_flow_get_stats(&reply, put->stats);
        ofpbuf_delete(buf);
    }
    return error;
}
The code performs two main steps: dpif_linux_init_flow_put() initializes the request, and dpif_linux_flow_transact() sends the nlmsg.
dpif_linux_init_flow_put() builds the request message from put; its code is:
static void
dpif_linux_init_flow_put(struct dpif *dpif_, const struct dpif_flow_put *put,
                         struct dpif_linux_flow *request)
{
    static struct nlattr dummy_action;
    struct dpif_linux *dpif = dpif_linux_cast(dpif_);

    dpif_linux_flow_init(request);
    request->cmd = (put->flags & DPIF_FP_CREATE
                    ? OVS_FLOW_CMD_NEW : OVS_FLOW_CMD_SET);
    request->dp_ifindex = dpif->dp_ifindex;
    request->key = put->key;
    request->key_len = put->key_len;
    /* Ensure that OVS_FLOW_ATTR_ACTIONS will always be included. */
    request->actions = put->actions ? put->actions : &dummy_action;
    request->actions_len = put->actions_len;
    if (put->flags & DPIF_FP_ZERO_STATS) {
        request->clear = true;
    }
    request->nlmsg_flags = put->flags & DPIF_FP_MODIFY ? 0 : NLM_F_CREATE;
}
dpif_linux_flow_transact() is then responsible for actually sending the nlmsg; its code is:
static int
dpif_linux_flow_transact(struct dpif_linux_flow *request,
                         struct dpif_linux_flow *reply, struct ofpbuf **bufp)
{
    struct ofpbuf *request_buf;
    int error;

    assert((reply != NULL) == (bufp != NULL));

    if (reply) {
        request->nlmsg_flags |= NLM_F_ECHO;
    }

    request_buf = ofpbuf_new(1024);
    dpif_linux_flow_to_ofpbuf(request, request_buf);
    error = nl_sock_transact(genl_sock, request_buf, bufp);
    ofpbuf_delete(request_buf);

    if (reply) {
        if (!error) {
            error = dpif_linux_flow_from_ofpbuf(reply, *bufp);
        }
        if (error) {
            dpif_linux_flow_init(reply);
            ofpbuf_delete(*bufp);
            *bufp = NULL;
        }
    }
    return error;
}
The first parameter of dpif_linux_flow_transact(), request, holds the data for the message to send; if the second and third parameters are non-NULL, they are used to store any reply that is received.
The function first calls dpif_linux_flow_to_ofpbuf(), which uses the attributes in request to build a struct ofpbuf *request_buf laid out as ovs_header followed by the attributes. The ovs_header structure (include/linux/openvswitch.h) stores only the dp_ifindex. The function then calls nl_sock_transact() to send the message.
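A minimal sketch of the assembly dpif_linux_flow_to_ofpbuf() performs, written with the message-construction helpers from lib/netlink.c, is shown below; the exact attribute set and ordering are simplified assumptions, not the literal OVS source.

/* Sketch (simplified) of how dpif_linux_flow_to_ofpbuf() serializes a
 * struct dpif_linux_flow into an ofpbuf: genl header, then ovs_header,
 * then the flow attributes. */
static void
flow_to_ofpbuf_sketch(const struct dpif_linux_flow *flow, struct ofpbuf *buf)
{
    struct ovs_header *ovs_header;

    /* Generic netlink header: family id, flags, command, version. */
    nl_msg_put_genlmsghdr(buf, 0, ovs_flow_family,
                          NLM_F_REQUEST | flow->nlmsg_flags,
                          flow->cmd, OVS_FLOW_VERSION);

    /* OVS header: identifies the target datapath by ifindex. */
    ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
    ovs_header->dp_ifindex = flow->dp_ifindex;

    /* Flow attributes, mirroring flow_policy on the kernel side. */
    if (flow->key_len) {
        nl_msg_put_unspec(buf, OVS_FLOW_ATTR_KEY, flow->key, flow->key_len);
    }
    if (flow->actions) {
        nl_msg_put_unspec(buf, OVS_FLOW_ATTR_ACTIONS,
                          flow->actions, flow->actions_len);
    }
    if (flow->clear) {
        nl_msg_put_flag(buf, OVS_FLOW_ATTR_CLEAR);
    }
}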
The code of nl_sock_transact() is:
int
nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
                 struct ofpbuf **replyp)
{
    struct nl_transaction *transactionp;
    struct nl_transaction transaction;

    transaction.request = CONST_CAST(struct ofpbuf *, request);
    transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
    transactionp = &transaction;

    nl_sock_transact_multiple(sock, &transactionp, 1);

    if (replyp) {
        if (transaction.error) {
            ofpbuf_delete(transaction.reply);
            *replyp = NULL;
        } else {
            *replyp = transaction.reply;
        }
    }

    return transaction.error;
}
nl_sock_transact() in turn calls nl_sock_transact_multiple() to send the message.
The first parameter of nl_sock_transact_multiple() is the socket to send on, the second holds the transactions (messages) to send, and the third specifies how many messages to send. Internally it mainly calls nl_sock_transact_multiple__() to finalize the nlmsg and send it out.
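As a usage illustration, a caller can batch several transactions through nl_sock_transact_multiple() in a single call, following the same ownership pattern as nl_sock_transact() above. The sketch below is an illustrative example rather than OVS source; the two prebuilt request buffers are assumed inputs.

/* Illustrative sketch: batching two prebuilt requests through
 * nl_sock_transact_multiple().  'first' and 'second' are assumed to be
 * ofpbufs already filled with complete generic netlink messages
 * (e.g. by dpif_linux_flow_to_ofpbuf()). */
static void
transact_two(struct nl_sock *sock, struct ofpbuf *first, struct ofpbuf *second)
{
    struct nl_transaction txns[2];
    struct nl_transaction *txnps[2];
    size_t i;

    txns[0].request = first;
    txns[1].request = second;
    for (i = 0; i < 2; i++) {
        txns[i].reply = ofpbuf_new(1024);   /* caller-owned reply buffer */
        txnps[i] = &txns[i];
    }

    nl_sock_transact_multiple(sock, txnps, 2);

    for (i = 0; i < 2; i++) {
        /* Each transaction carries its own error code and reply; a real
         * caller would parse the replies here before freeing them. */
        if (txns[i].error) {
            /* Per-transaction error handling would go here. */
        }
        ofpbuf_delete(txns[i].reply);
    }
}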
The handling of the other types of operations follows a similar process.