Dynamic process
The datapath receives a packet
In dp_init(), dp_register_genl() is called to register the netlink families and ops for the four kinds of objects: dp, vport, flow and packet.
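For context, here is a minimal sketch of how one of those families could be hooked up with the generic netlink API of that kernel era, using the datapath family and the dp_datapath_genl_ops[] array shown further below. register_datapath_family() is a hypothetical wrapper; registration of the other three families and of multicast groups is omitted.

/* Sketch (assumed shape): registering one generic netlink family plus its
 * ops, the kind of work dp_register_genl() does for dp, vport, flow and
 * packet. */
static int register_datapath_family(void)
{
    /* genl_register_family_with_ops() is the pre-3.13 kernel API. */
    return genl_register_family_with_ops(&dp_datapath_genl_family,
                                         dp_datapath_genl_ops,
                                         ARRAY_SIZE(dp_datapath_genl_ops));
}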
When openvswitch.ko in the kernel receives a command to add a bridge, that is, an OVS_DP_CMD_NEW command on the OVS_DATAPATH_FAMILY channel, the callback bound to that command is ovs_dp_cmd_new(). The implementation is in datapath/datapath.c; the key code is:
static struct genl_ops dp_datapath_genl_ops[] = {
    { .cmd = OVS_DP_CMD_NEW,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = datapath_policy,
      .doit = ovs_dp_cmd_new
    },
    { .cmd = OVS_DP_CMD_DEL,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = datapath_policy,
      .doit = ovs_dp_cmd_del
    },
    { .cmd = OVS_DP_CMD_GET,
      .flags = 0,               /* OK for unprivileged users. */
      .policy = datapath_policy,
      .doit = ovs_dp_cmd_get,
      .dumpit = ovs_dp_cmd_dump
    },
    { .cmd = OVS_DP_CMD_SET,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = datapath_policy,
      .doit = ovs_dp_cmd_set,
    },
};
Besides initializing the dp structure, ovs_dp_cmd_new() also calls new_vport() to create a new vport, and new_vport() in turn calls ovs_vport_add() to try to create it. The key code is:
static struct vport *new_vport(const struct vport_parms *parms)
{
    struct vport *vport;

    vport = ovs_vport_add(parms);
    if (!IS_ERR(vport)) {
        struct datapath *dp = parms->dp;
        struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);

        hlist_add_head_rcu(&vport->dp_hash_node, head);
        dp_ifinfo_notify(RTM_NEWLINK, vport);
    }

    return vport;
}
ovs_vport_add() checks the vport type and calls the matching create() function to build the vport structure. The key code is:
struct vport *ovs_vport_add(const struct vport_parms *parms)
{
    struct vport *vport;
    int err = 0;
    int i;

    ASSERT_RTNL();
    for (i = 0; i < n_vport_types; i++) {
        if (vport_ops_list[i]->type == parms->type) {
            struct hlist_head *bucket;

            vport = vport_ops_list[i]->create(parms);
            if (IS_ERR(vport)) {
                err = PTR_ERR(vport);
                goto out;
            }

            bucket = hash_bucket(ovs_dp_get_net(vport->dp),
                                 vport->ops->get_name(vport));
            hlist_add_head_rcu(&vport->hash_node, bucket);
            return vport;
        }
    }

    err = -EAFNOSUPPORT;

out:
    return ERR_PTR(err);
}
vport_ops_list[] is filled in during ovs_vport_init() so that it ends up identical to base_vport_ops_list[]. The key code is:
int ovs_vport_init(void)
{
    int err;
    int i;

    dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
                        GFP_KERNEL);
    if (!dev_table) {
        err = -ENOMEM;
        goto error;
    }

    vport_ops_list = kmalloc(ARRAY_SIZE(base_vport_ops_list) *
                             sizeof(struct vport_ops *), GFP_KERNEL);
    if (!vport_ops_list) {
        err = -ENOMEM;
        goto error_dev_table;
    }

    /* Create vport_ops_list, templated from base_vport_ops_list. */
    for (i = 0; i < ARRAY_SIZE(base_vport_ops_list); i++) {
        const struct vport_ops *new_ops = base_vport_ops_list[i]; /* check each vport_ops instance */

        if (new_ops->init)
            err = new_ops->init();  /* init each vport_ops */
        else
            err = 0;

        if (!err)
            vport_ops_list[n_vport_types++] = new_ops;
        else if (new_ops->flags & VPORT_F_REQUIRED) {
            ovs_vport_exit();
            goto error;
        }
    }

    return 0;

error_dev_table:
    kfree(dev_table);
error:
    return err;
}
base_vport_ops_list[] currently has five members:
/* List of statically compiled vport implementations.  Don't forget to also
 * add yours to the list at the bottom of vport.h. */
static const struct vport_ops *base_vport_ops_list[] = {
    &ovs_netdev_vport_ops,      /* netdev instance */
    &ovs_internal_vport_ops,
    &ovs_patch_vport_ops,
    &ovs_gre_vport_ops,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
    &ovs_capwap_vport_ops,
#endif
};
Therefore, when the vport is of the netdev type, the functions defined in ovs_netdev_vport_ops are used, including init, create and so on. The function list is:
const struct vport_ops ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.flags = VPORT_F_REQUIRED,
.init = netdev_init,
.exit = netdev_exit,
.create = netdev_create,
.destroy = netdev_destroy,
.set_addr = ovs_netdev_set_addr,
.get_name = ovs_netdev_get_name,
.get_addr = ovs_netdev_get_addr,
.get_kobj = ovs_netdev_get_kobj,
.get_dev_flags = ovs_netdev_get_dev_flags,
.is_running = ovs_netdev_is_running,
.get_operstate = ovs_netdev_get_operstate,
.get_ifindex = ovs_netdev_get_ifindex,
.get_mtu = ovs_netdev_get_mtu,
.send = netdev_send,
};
So when the port is a network device (vport-netdev.c), ovs_vport_add() ends up calling netdev_create(), and the most important step in netdev_create() is registering the callback that is invoked when a packet is received:
err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, vport);
This registration hands packets received on netdev_vport->dev over to netdev_frame_hook(). After some auxiliary processing, the handler functions are then called in sequence: ovs_vport_receive() brings the packet back into vport.c, and ovs_dp_process_received_packet() brings it into datapath.c for unified processing.
netdev_frame_hook() → netdev_port_receive() → ovs_vport_receive() → ovs_dp_process_received_packet()
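For orientation, the receive hook installed above roughly has the shape sketched below. The rx_handler signature varies across kernel versions, and the helper names are only assumed here; treat this as an illustration of the path, not the exact datapath source.

/* Sketch: rx_handler installed by netdev_rx_handler_register(); follows the
 * "struct sk_buff **" hook variant used by kernels of that era. */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
    struct sk_buff *skb = *pskb;
    struct vport *vport;

    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

    vport = ovs_netdev_get_vport(skb->dev);  /* assumed helper name */
    netdev_port_receive(vport, skb);         /* strips the hardware header and
                                              * then calls ovs_vport_receive() */
    return RX_HANDLER_CONSUMED;
}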
ovs_dp_process_received_packet() (datapath/datapath.c) does the heavy lifting: it looks up the flow table and executes the actions of the matching flow. When the lookup fails, ovs_dp_upcall() sends an upcall to user space (ovs-vswitchd), which takes over further processing.
ovs_dp_process_received_packet() → ovs_dp_upcall() → queue_userspace_packet()
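The heart of ovs_dp_process_received_packet() can be summarized by the sketch below. It is simplified: statistics updates and error handling are dropped, and the helper names (ovs_flow_extract(), ovs_flow_tbl_lookup(), ovs_execute_actions()) follow the kernel module of this era but should be read as an approximation.

/* Simplified sketch of the hit/miss decision in
 * ovs_dp_process_received_packet(). */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
    struct datapath *dp = p->dp;
    struct sw_flow *flow;
    struct sw_flow_key key;
    int key_len;

    /* Extract the flow key from the packet headers. */
    if (ovs_flow_extract(skb, p->port_no, &key, &key_len))
        goto drop;

    /* Exact-match lookup in the kernel flow table. */
    flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
    if (unlikely(!flow)) {
        struct dp_upcall_info upcall;

        upcall.cmd = OVS_PACKET_CMD_MISS;   /* miss: hand over to userspace */
        upcall.key = &key;
        ovs_dp_upcall(dp, skb, &upcall);    /* -> queue_userspace_packet() */
        consume_skb(skb);
        return;
    }

    /* Hit: execute the actions cached with the flow. */
    OVS_CB(skb)->flow = flow;
    ovs_execute_actions(dp, skb);
    return;

drop:
    kfree_skb(skb);
}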
ovs-vswitchd fast-path processing
ovs-vswitchd uses bridge_run() / bridge_run_fast() (vswitchd/bridge.c) to keep polling each bridge and carry out the corresponding work. They mainly call ofproto_run() and ofproto_run_fast() to drive the ofproto processing on each bridge. The run_fast() path skips some work that is not strictly necessary and mainly handles upcalls, so it is faster.
bridge_run_fast()
We start with the run_fast() path.
void
bridge_run_fast(void)
{
struct bridge *br;
HMAP_FOR_EACH (br, node, &all_bridges) {
ofproto_run_fast(br->ofproto);
}
}
ofproto_run_fast() calls the run_fast() member of struct ofproto_class {}. struct ofproto_class (ofproto/ofproto-provider.h) is an abstract class, and run_fast() is a function pointer; the concrete class it points into is bound while bridge_run() sets up the ofproto.
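For reference, the dispatch inside ofproto_run_fast() is just a thin forward through that class pointer; a minimal sketch (error logging omitted) looks like:

/* Sketch: ofproto_run_fast() (ofproto/ofproto.c) forwards to the concrete
 * class, which for the dpif-based implementation ends up in
 * ofproto_dpif_class's run_fast(). */
int
ofproto_run_fast(struct ofproto *p)
{
    return p->ofproto_class->run_fast ? p->ofproto_class->run_fast(p) : 0;
}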
The possible ofproto_class implementations are declared in the ofproto_classes[] array, which is initialized by ofproto_initialize(). In ofproto/ofproto.c we find the following code:
static void
ofproto_initialize(void)
{
static bool inited;
if (!inited) {
inited = true;
ofproto_class_register(&ofproto_dpif_class);
}
}
ofproto_class_register() is defined as follows:
int
ofproto_class_register(const struct ofproto_class *new_class)
{
    size_t i;

    for (i = 0; i < n_ofproto_classes; i++) {
        if (ofproto_classes[i] == new_class) {
            return EEXIST;
        }
    }

    if (n_ofproto_classes >= allocated_ofproto_classes) {
        ofproto_classes = x2nrealloc(ofproto_classes,
                                     &allocated_ofproto_classes,
                                     sizeof *ofproto_classes);
    }
    ofproto_classes[n_ofproto_classes++] = new_class;
    return 0;
}
So after ofproto_initialize() has run (it is called from several places but only executes once), ofproto_classes[] holds a single entry, ofproto_dpif_class. ofproto_dpif_class is defined in ofproto/ofproto-dpif.c and names all of the member variables and operation functions:
const struct ofproto_class ofproto_dpif_class = {
enumerate_types,
enumerate_names,
del,
alloc,
construct,
destruct,
dealloc,
run,
run_fast,
wait,
get_memory_usage,
flush,
get_features,
get_tables,
port_alloc,
port_construct,
port_destruct,
port_dealloc,
port_modified,
port_reconfigured,
port_query_by_name,
port_add,
port_del,
port_get_stats,
port_dump_start,
port_dump_next,
port_dump_done,
port_poll,
port_poll_wait,
port_is_lacp_current,
NULL, /* rule_choose_table */
rule_alloc,
rule_construct,
rule_destruct,
rule_dealloc,
rule_get_stats,
rule_execute,
rule_modify_actions,
set_frag_handling,
packet_out,
set_netflow,
get_netflow_ids,
set_sflow,
set_cfm,
get_cfm_fault,
get_cfm_opup,
get_cfm_remote_mpids,
get_cfm_health,
set_stp,
get_stp_status,
set_stp_port,
get_stp_port_status,
set_queues,
bundle_set,
bundle_remove,
mirror_set,
mirror_get_stats,
set_flood_vlans,
is_mirror_output_bundle,
forward_bpdu_changed,
set_mac_idle_time,
set_realdev,
};
The ofproto_class initialization is triggered from several places; one possible call path is:
bridge_run() → bridge_reconfigure() → bridge_update_ofprotos() → ofproto_create() → ofproto_initialize()
Besides this path, ofproto_class_find__() also calls ofproto_initialize().
Thus the function pointers in ofproto_class in practice point at the functions in ofproto_dpif_class.
run_fast()
Let us look at run_fast(struct ofproto *ofproto_) in ofproto_dpif_class (ofproto/ofproto-dpif.c):
static int
run_fast(struct ofproto *ofproto_)
{
    struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
    struct ofport_dpif *ofport;
    unsigned int work;

    HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) {
        port_run_fast(ofport);
    }

    /* Handle one or more batches of upcalls, until there's nothing left to do
     * or until we do a fixed total amount of work.
     *
     * We do work in batches because it can be much cheaper to set up a number
     * of flows and fire off their patches all at once.  We do multiple batches
     * because in some cases handling a packet can cause another packet to be
     * queued almost immediately as part of the return flow.  Both
     * optimizations can make major improvements on some benchmarks and
     * presumably for real traffic as well. */
    work = 0;
    while (work < FLOW_MISS_MAX_BATCH) {
        int retval = handle_upcalls(ofproto, FLOW_MISS_MAX_BATCH - work);
        if (retval <= 0) {
            return -retval;
        }
        work += retval;
    }
    return 0;
}
In essence this boils down to the call to handle_upcalls(), which makes sense: one of the most important jobs of ovs-vswitchd is to listen for and handle the upcall requests coming from each bridge.
handle_upcalls()
The code is as follows:
static int
handle_upcalls(struct ofproto_dpif *ofproto, unsigned int max_batch)
{
    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
    int n_processed;
    int n_misses;
    int i;

    assert(max_batch <= FLOW_MISS_MAX_BATCH);

    n_misses = 0;
    for (n_processed = 0; n_processed < max_batch; n_processed++) {
        struct dpif_upcall *upcall = &misses[n_misses];
        struct ofpbuf *buf = &miss_bufs[n_misses];
        int error;

        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses],
                        sizeof miss_buf_stubs[n_misses]);
        error = dpif_recv(ofproto->dpif, upcall, buf);
        if (error) {
            ofpbuf_uninit(buf);
            break;
        }

        switch (classify_upcall(upcall)) {
        case MISS_UPCALL:
            /* Handle it later. */
            n_misses++;
            break;

        case SFLOW_UPCALL:
            if (ofproto->sflow) {
                handle_sflow_upcall(ofproto, upcall);
            }
            ofpbuf_uninit(buf);
            break;

        case BAD_UPCALL:
            ofpbuf_uninit(buf);
            break;
        }
    }

    /* Handle deferred MISS_UPCALL processing. */
    handle_miss_upcalls(ofproto, misses, n_misses);
    for (i = 0; i < n_misses; i++) {
        ofpbuf_uninit(&miss_bufs[i]);
    }

    return n_processed;
}
This code handles the various kinds of upcalls, including MISS_UPCALL (the datapath found no matching flow) and sFlow sampling traffic. MISS_UPCALL handling mainly happens in handle_miss_upcalls() (ofproto/ofproto-dpif.c), whose core steps are handle_flow_miss() followed by dpif_operate(): handle_flow_miss() works out the actions for each upcall, and dpif_operate() then carries them out.
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
handle_flow_miss()
Located in ofproto/ofproto-dpif.c; the main flow is as follows:
static void
handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss,
                 struct flow_miss_op *ops, size_t *n_ops)
{
    struct facet *facet;
    uint32_t hash;

    /* The caller must ensure that miss->hmap_node.hash contains
     * flow_hash(miss->flow, 0). */
    hash = miss->hmap_node.hash;

    facet = facet_lookup_valid(ofproto, &miss->flow, hash);
    if (!facet) {
        struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow);

        if (!flow_miss_should_make_facet(ofproto, miss, hash)) {
            handle_flow_miss_without_facet(miss, rule, ops, n_ops);
            return;
        }

        facet = facet_create(rule, &miss->flow, hash);
    }
    handle_flow_miss_with_facet(miss, facet, ops, n_ops);
}
This process has two main parts. First, facet_lookup_valid() searches the local table for a facet that exactly matches the flow.
If no facet exists, rule_dpif_lookup() looks up the matching rule, and flow_miss_should_make_facet() decides whether it is worth adding a corresponding entry to ovsd's ofproto and pushing it down to the datapath (it usually is). If not, handle_flow_miss_without_facet() turns the rule into entries in ops and returns; otherwise facet_create() adds a new facet to the ofproto.
Once a facet exists, handle_flow_miss_with_facet() updates ops. It calls handle_flow_miss_common() for state checks: if the bridge is in fail mode, a miss message is sent to the controller (in ofproto/ofproto-dpif.c, via send_packet_in_miss() → connmgr_send_packet_in()); afterwards it checks whether the flow should be flagged as a slow flow, and so on.
Take rule_dpif_lookup() as an example: it in turn calls rule_dpif_lookup__(), whose code is:
static struct rule_dpif *
rule_dpif_lookup__(struct ofproto_dpif *ofproto, const struct flow *flow,
                   uint8_t table_id)
{
    struct cls_rule *cls_rule;
    struct classifier *cls;

    if (table_id >= N_TABLES) {
        return NULL;
    }

    cls = &ofproto->up.tables[table_id].cls;
    if (flow->nw_frag & FLOW_NW_FRAG_ANY
        && ofproto->up.frag_handling == OFPC_FRAG_NORMAL) {
        /* For OFPC_NORMAL frag_handling, we must pretend that transport ports
         * are unavailable. */
        struct flow ofpc_normal_flow = *flow;
        ofpc_normal_flow.tp_src = htons(0);
        ofpc_normal_flow.tp_dst = htons(0);
        cls_rule = classifier_lookup(cls, &ofpc_normal_flow);
    } else {
        cls_rule = classifier_lookup(cls, flow);
    }
    return rule_dpif_cast(rule_from_cls_rule(cls_rule));
}
classifier_lookup() searches the rule tables stored in the ofproto and returns the highest-priority matching rule.
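Conceptually the lookup behaves like the toy sketch below: each rule carries a match plus a priority, and the classifier returns the matching rule with the highest priority. The names (struct toy_rule, toy_rule_matches()) are hypothetical, and the real lib/classifier groups rules into per-wildcard-mask hash tables instead of scanning a list, but the semantics are the same.

/* Illustrative stand-in for classifier_lookup(): return the highest-priority
 * rule that matches the flow, or NULL on a table miss. */
struct toy_rule {
    struct toy_rule *next;
    int priority;
    /* match fields ... */
};

static const struct toy_rule *
toy_classifier_lookup(const struct toy_rule *rules, const struct flow *flow)
{
    const struct toy_rule *best = NULL;
    const struct toy_rule *r;

    for (r = rules; r; r = r->next) {
        if (toy_rule_matches(r, flow)           /* hypothetical matcher */
            && (!best || r->priority > best->priority)) {
            best = r;
        }
    }
    return best;
}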
dpif_operate()
dpif_operate(ofproto->dpif, dpif_ops, n_ops) carries out the operations decided on by handle_flow_miss(). The main code is:
    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
dpif_operate() first checks whether the concrete dpif_class provides an operate() function. If it does, operate() is called directly; otherwise, depending on each op's type, the dpif_class's flow_put(), flow_del() or execute() function is called.
The operate() function exists
First consider the case where operate() is provided:
dpif->dpif_class->operate(dpif, ops, n_ops);
Which operate() is called depends on the concrete class. The available dpif_class implementations are declared in base_dpif_classes[] in lib/dpif.c:
static const struct dpif_class *base_dpif_classes[] = {
#ifdef HAVE_NETLINK
&dpif_linux_class,
#endif
&dpif_netdev_class,
};
dpif_linux_class talks to the local kernel datapath over netlink, while dpif_netdev_class is the userspace ("netdev") datapath implementation. Here we analyse the common dpif-linux case, whose operate() is dpif_linux_operate() (lib/dpif-linux.c).
dpif_linux_operate() essentially just calls dpif_linux_operate__(). In dpif_linux_operate__(), an aux->request netlink message is first built for each incoming dpif_op (PUT, DEL, EXECUTE); then nl_sock_transact_multiple() (lib/netlink-socket.c) is called to send the requests and collect the replies:
nl_sock_transact_multiple(genl_sock, txnsp, n_ops);
Note that txnsp carries both the outgoing requests and the returned replies.
The replies are then examined and the relevant statistics are updated.
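A condensed sketch of that per-op encoding step is shown below. It is an assumed loop fragment (the surrounding locals ops, auxes, txnsp and dpif are taken for granted): encode_flow_put() and encode_flow_del() are hypothetical stand-ins for the internal helpers, while dpif_linux_encode_execute() and the OVS_*_CMD_* generic netlink commands are the ones referenced elsewhere in this section.

/* Sketch (assumed, simplified): how dpif_linux_operate__() maps each dpif_op
 * onto one netlink request before sending the whole batch. */
for (i = 0; i < n_ops; i++) {
    struct dpif_op *op = ops[i];
    struct nl_transaction *txn = &auxes[i].txn;   /* request/reply pair */

    switch (op->type) {
    case DPIF_OP_FLOW_PUT:        /* becomes an OVS_FLOW_CMD_NEW/SET request */
        encode_flow_put(&op->u.flow_put, txn->request);
        break;
    case DPIF_OP_FLOW_DEL:        /* becomes an OVS_FLOW_CMD_DEL request */
        encode_flow_del(&op->u.flow_del, txn->request);
        break;
    case DPIF_OP_EXECUTE:         /* becomes an OVS_PACKET_CMD_EXECUTE request */
        dpif_linux_encode_execute(dpif->dp_ifindex, &op->u.execute,
                                  txn->request);
        break;
    }
    txnsp[i] = txn;
}
nl_sock_transact_multiple(genl_sock, txnsp, n_ops);  /* send all, collect replies */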
The operate() function does not exist
If a concrete dpif_class does not provide operate(), each type of operation has to be handled individually:
    for (i = 0; i < n_ops; i++) {
        struct dpif_op *op = ops[i];

        switch (op->type) {
        case DPIF_OP_FLOW_PUT:
            op->error = dpif_flow_put__(dpif, &op->u.flow_put);
            break;

        case DPIF_OP_FLOW_DEL:
            op->error = dpif_flow_del__(dpif, &op->u.flow_del);
            break;

        case DPIF_OP_EXECUTE:
            op->error = dpif_execute__(dpif, &op->u.execute);
            break;

        default:
            NOT_REACHED();
        }
    }
This covers the three op types DPIF_OP_FLOW_PUT, DPIF_OP_FLOW_DEL and DPIF_OP_EXECUTE (all defined in lib/dpif.h).
enum dpif_op_type {
DPIF_OP_FLOW_PUT = 1,
DPIF_OP_FLOW_DEL,
DPIF_OP_EXECUTE,
};
The third case, DPIF_OP_EXECUTE, is worth noting because the execute command has to be sent back down to the datapath. dpif_execute__() calls the execute() function of the dpif_class abstract type held in the dpif structure.
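A minimal sketch of that dispatch (the real dpif_execute__() in lib/dpif.c also updates coverage counters and skips empty action lists; the struct-based execute() signature is assumed from the code shown in this section):

/* Sketch: hand the execute request from the generic dpif layer to the
 * concrete class, e.g. dpif_linux_execute() for the "system" datapath. */
static int
dpif_execute__(struct dpif *dpif, const struct dpif_execute *execute)
{
    return dpif->dpif_class->execute(dpif, execute);
}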
Again taking dpif-linux (lib/dpif-linux.c) as the example, it is defined as:
const struct dpif_class dpif_linux_class = {
"system",
dpif_linux_enumerate,
dpif_linux_open,
dpif_linux_close,
dpif_linux_destroy,
dpif_linux_run,
dpif_linux_wait,
dpif_linux_get_stats,
dpif_linux_port_add,
dpif_linux_port_del,
dpif_linux_port_query_by_number,
dpif_linux_port_query_by_name,
dpif_linux_get_max_ports,
dpif_linux_port_get_pid,
dpif_linux_port_dump_start,
dpif_linux_port_dump_next,
dpif_linux_port_dump_done,
dpif_linux_port_poll,
dpif_linux_port_poll_wait,
dpif_linux_flow_get,
dpif_linux_flow_put,
dpif_linux_flow_del,
dpif_linux_flow_flush,
dpif_linux_flow_dump_start,
dpif_linux_flow_dump_next,
dpif_linux_flow_dump_done,
dpif_linux_execute,
dpif_linux_operate,
dpif_linux_recv_set,
dpif_linux_queue_to_priority,
dpif_linux_recv,
dpif_linux_recv_wait,
dpif_linux_recv_purge,
};
So the function that runs is dpif_linux_execute() (lib/dpif-linux.c), which first calls dpif_linux_execute__():
static int
dpif_linux_execute__(int dp_ifindex, const struct dpif_execute *execute)
{
uint64_t request_stub[1024 / 8];
struct ofpbuf request;
int error;
ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
dpif_linux_encode_execute(dp_ifindex, execute, &request);
error = nl_sock_transact(genl_sock, &request, NULL);
ofpbuf_uninit(&request);
return error;
}
This function builds an OVS_PACKET_CMD_EXECUTE netlink message and sends it to the datapath with nl_sock_transact().
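For reference, the request built by dpif_linux_encode_execute() essentially packs the packet, its flow key and the actions to run into one generic netlink message. A hedged sketch follows (sketch_encode_execute() is a stand-in name; the attributes come from include/linux/openvswitch.h):

/* Sketch (simplified): the contents of an OVS_PACKET_CMD_EXECUTE request. */
static void
sketch_encode_execute(int dp_ifindex, const struct dpif_execute *execute,
                      struct ofpbuf *buf)
{
    struct ovs_header *ovs_header;

    /* Generic netlink header: ovs_packet family, OVS_PACKET_CMD_EXECUTE. */
    nl_msg_put_genlmsghdr(buf, 0, ovs_packet_family, NLM_F_REQUEST,
                          OVS_PACKET_CMD_EXECUTE, OVS_PACKET_VERSION);

    ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
    ovs_header->dp_ifindex = dp_ifindex;

    /* The packet data itself, its flow key, and the actions to apply. */
    nl_msg_put_unspec(buf, OVS_PACKET_ATTR_PACKET,
                      execute->packet->data, execute->packet->size);
    nl_msg_put_unspec(buf, OVS_PACKET_ATTR_KEY,
                      execute->key, execute->key_len);
    nl_msg_put_unspec(buf, OVS_PACKET_ATTR_ACTIONS,
                      execute->actions, execute->actions_len);
}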
ovs-vswitchd full processing
The complete configuration and processing is driven by bridge_run().
bridge_run() performs the full set of bridge operations, including handling OpenFlow commands, maintaining the bridges, synchronizing with the database, and keeping up the connections to the controller. It then calls ofproto_run() to handle the related ofproto processing (see 3.2.17).
Finally it checks whether it is time for the periodic log entry (the default period is LLONG_MAX), writes it if so, and refreshes the various connections and status information.
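Putting the steps listed above into a rough skeleton (the comments mark the stages this section describes; it is an outline, not the literal vswitchd/bridge.c code):

/* Rough outline of one bridge_run() iteration (simplified). */
void
bridge_run(void)
{
    struct bridge *br;

    /* React to database/configuration changes: reconfigure bridges and
     * (re)create ofprotos, ports and controller connections, e.g.
     * bridge_reconfigure() -> bridge_update_ofprotos() -> ofproto_create(). */

    /* Full ofproto processing for every bridge, which covers OpenFlow
     * message handling and controller connection maintenance. */
    HMAP_FOR_EACH (br, node, &all_bridges) {
        ofproto_run(br->ofproto);
    }

    /* Periodic work: emit the periodic log entry (the period defaults to
     * LLONG_MAX, i.e. effectively never) and refresh status and statistics
     * that are pushed back into the database. */
}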