This section analyzes the vswitchd module. As the component that manages the underlying datapaths, it implements the main OpenFlow logic, the concrete management of the switch, and all logic beyond simple forwarding. It builds the ovs-vswitchd binary, whose main source file is vswitchd/ovs-vswitchd.c.
Overall analysis
The vswitchd module mainly consists of sub-modules such as bridge and ofproto. As the main logic module, it is responsible for parsing and executing the various OpenFlow commands.
The bridge module
It is responsible for all of the datapaths being managed. Its external interface is quite simple:
void bridge_init(const char *remote);
void bridge_exit(void);
void bridge_run(void);
void bridge_run_fast(void);
void bridge_wait(void);
void bridge_get_memory_usage(struct simap *usage);
The main data structure is struct bridge, defined in bridge.c:
struct bridge {
    struct hmap_node node;      /* In 'all_bridges'. */
    char *name;                 /* User-specified arbitrary name. */
    char *type;                 /* Datapath type. */
    uint8_t ea[ETH_ADDR_LEN];   /* Bridge Ethernet Address. */
    uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */
    const struct ovsrec_bridge *cfg;

    /* OpenFlow switch processing. */
    struct ofproto *ofproto;    /* OpenFlow switch. */

    /* Bridge ports. */
    struct hmap ports;          /* "struct port"s indexed by name. */
    struct hmap ifaces;         /* "struct iface"s indexed by ofp_port. */
    struct hmap iface_by_name;  /* "struct iface"s indexed by name. */
    struct list ofpp_garbage;   /* "struct ofpp_garbage" slated for removal. */
    struct hmap if_cfg_todo;    /* "struct if_cfg"s slated for creation.
                                   Indexed on 'cfg->name'. */

    /* Port mirroring. */
    struct hmap mirrors;        /* "struct mirror" indexed by UUID. */

    /* Synthetic local port if necessary. */
    struct ovsrec_port synth_local_port;
    struct ovsrec_interface synth_local_iface;
    struct ovsrec_interface *synth_local_ifacep;
};
The most important field is the ofproto pointer, which points to an OpenFlow switch and carries out all OpenFlow switch processing. In fact, the main job of vswitchd is to continuously poll the ofproto of every bridge and invoke its processing functions.
ofproto
The type is defined in ofproto/ofproto-provider.h.
struct ofproto {
    struct hmap_node hmap_node; /* In global 'all_ofprotos' hmap. */
    const struct ofproto_class *ofproto_class;
    char *type;                 /* Datapath type. */
    char *name;                 /* Datapath name. */

    /* Settings. */
    uint64_t fallback_dpid;     /* Datapath ID if no better choice found. */
    uint64_t datapath_id;       /* Datapath ID. */
    unsigned flow_eviction_threshold; /* Threshold at which to begin flow
                                       * table eviction. Only affects the
                                       * ofproto-dpif implementation */
    bool forward_bpdu;          /* Option to allow forwarding of BPDU frames
                                 * when NORMAL action is invoked. */
    char *mfr_desc;             /* Manufacturer. */
    char *hw_desc;              /* Hardware. */
    char *sw_desc;              /* Software version. */
    char *serial_desc;          /* Serial number. */
    char *dp_desc;              /* Datapath description. */
    enum ofp_config_flags frag_handling; /* One of OFPC_*.  */

    /* Datapath. */
    struct hmap ports;          /* Contains "struct ofport"s. */
    struct shash port_by_name;

    /* Flow tables. */
    struct oftable *tables;
    int n_tables;

    /* OpenFlow connections. */
    struct connmgr *connmgr;

    /* Flow table operation tracking. */
    int state;                  /* Internal state. */
    struct list pending;        /* List of "struct ofopgroup"s. */
    unsigned int n_pending;     /* list_size(&pending). */
    struct hmap deletions;      /* All OFOPERATION_DELETE "ofoperation"s. */

    /* Flow table operation logging. */
    int n_add, n_delete, n_modify; /* Number of unreported ops of each kind. */
    long long int first_op, last_op; /* Range of times for unreported ops. */
    long long int next_op_report;    /* Time to report ops, or LLONG_MAX. */
    long long int op_backoff;        /* Earliest time to report ops again. */

    /* Linux VLAN device support (e.g. "eth0.10" for VLAN 10.)
     *
     * This is deprecated.  It is only for compatibility with broken device
     * drivers in old versions of Linux that do not properly support VLANs when
     * VLAN devices are not used.  When broken device drivers are no longer in
     * widespread use, we will delete these interfaces. */
    unsigned long int *vlan_bitmap; /* 4096-bit bitmap of in-use VLANs. */
    bool vlans_changed;             /* True if new VLANs are in use. */
    int min_mtu;                    /* Current MTU of non-internal ports. */
};
The key field here is ofproto_class, the concrete implementation of the ofproto switch. It defines how the OpenFlow protocol is handled, including the run and run_fast functions (the former is more comprehensive and calls the latter). The handler functions are implemented in ofproto/ofproto-dpif.c.
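For orientation, here is a heavily abridged sketch of the provider interface; the real struct ofproto_class in ofproto/ofproto-provider.h has many more callbacks than the life-cycle hooks shown here:

/* Abridged sketch of struct ofproto_class; only the life-cycle hooks
 * discussed in this section are listed. */
struct ofproto;

struct ofproto_class {
    struct ofproto *(*alloc)(void);            /* Allocate an ofproto. */
    int (*construct)(struct ofproto *ofproto); /* Initialize it. */
    int (*run)(struct ofproto *ofproto);       /* Full periodic processing. */
    int (*run_fast)(struct ofproto *ofproto);  /* Cheap, frequent processing. */
    void (*wait)(struct ofproto *ofproto);     /* Register poll-loop waits. */
    /* ... many port-, flow- and connection-related callbacks omitted ... */
};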
run_fast()
Located in ofproto-dpif.c.
The run_fast function performs two tasks that need to be done promptly and periodically.
First, it calls port_run_fast on every port, which checks whether a Continuity Check Message (CCM, see IEEE 802.1ag) needs to be sent and, if so, sends it.
HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) {
    port_run_fast(ofport);
}
Then it checks for upcalls and processes all upcalls coming from the datapath.
while (work < FLOW_MISS_MAX_BATCH) {
    int retval = handle_upcalls(ofproto, FLOW_MISS_MAX_BATCH - work);
    if (retval <= 0) {
        return -retval;
    }
    work += retval;
}
handle_upcalls()
Located in ofproto-dpif.c.
After fetching the upcalls from the corresponding dpif, this function checks their type. SFLOW_UPCALL and BAD_UPCALL are handled in place and the buffer holding the upcall message is then freed, while MISS_UPCALL is passed to handle_miss_upcalls for further processing.
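In outline, the per-upcall dispatch looks roughly like the following. This is a simplified sketch: dispatch_one_upcall() is a name made up here for illustration, while classify_upcall() and handle_sflow_upcall() are the helpers in ofproto-dpif.c; the real code batches the MISS_UPCALLs rather than handling them one at a time.

/* Simplified sketch of the dispatch in handle_upcalls(). */
static void
dispatch_one_upcall(struct ofproto_dpif *ofproto, struct dpif_upcall *upcall,
                    struct ofpbuf *buf)
{
    switch (classify_upcall(upcall)) {
    case MISS_UPCALL:
        /* Flow miss: queued and later handled by handle_miss_upcalls(). */
        break;

    case SFLOW_UPCALL:
        handle_sflow_upcall(ofproto, upcall);  /* emit an sFlow sample */
        ofpbuf_uninit(buf);
        break;

    case BAD_UPCALL:
        ofpbuf_uninit(buf);                    /* malformed upcall: drop */
        break;
    }
}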
An upcall has the type struct dpif_upcall (lib/dpif.h), defined as:
/* A packet passed up from the datapath to userspace.
 *
 * If 'key' or 'actions' is nonnull, then it points into data owned by
 * 'packet', so their memory cannot be freed separately.  (This is hardly a
 * great way to do things but it works out OK for the dpif providers and
 * clients that exist so far.)
 */
struct dpif_upcall {
    /* All types. */
    enum dpif_upcall_type type;
    struct ofpbuf *packet;      /* Packet data. */
    struct nlattr *key;         /* Flow key. */
    size_t key_len;             /* Length of 'key' in bytes. */

    /* DPIF_UC_ACTION only. */
    uint64_t userdata;          /* Argument to OVS_ACTION_ATTR_USERSPACE. */
};
handle_miss_upcalls()
Located in ofproto-dpif.c.
This function extracts the flow information from the upcalls, groups packets that belong to the same key, and puts them on a todo list. It then examines every element of the todo list and calls handle_flow_miss() on it.
HMAP_FOR_EACH (miss, hmap_node, &todo) {
    handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
}
Once processing is complete, dpif_operate() (in lib/dpif.c) is called to execute the actions that were looked up.
for (i = 0; i < n_ops; i++) {
    dpif_ops[i] = &flow_miss_ops[i].dpif_op;
}
dpif_operate(ofproto->dpif, dpif_ops, n_ops);
handle_flow_miss()
Located in ofproto/ofproto-dpif.c.
This function handles one flow miss from the todo list. It first checks whether an exact-match entry (a facet) already exists; if so, it calls handle_flow_miss_with_facet() with that match, otherwise it calls handle_flow_miss_without_facet().
unixctl_server handling
The main loop calls unixctl_server_run(). This function first accepts connections to the server and then executes the commands received on each connection:
LIST_FOR_EACH_SAFE (conn, next, node, &server->conns) {
    int error = run_connection(conn);
    if (error && error != EAGAIN) {
        kill_connection(conn);
    }
}
ovs-vswitchd.c
This is the main file. main() is the entry function: it performs a series of initializations and sets things up, then enters the main loop where the actual work is done. The main code is analyzed below.
int main(int argc, char *argv[])
{
    char *unixctl_path = NULL;
    struct unixctl_server *unixctl;
    struct signal *sighup;
    char *remote;
    bool exiting;
    int retval;

    proctitle_init(argc, argv); //back up the original argv
    set_program_name(argv[0]);
    stress_init_command(); //register the stress commands
    remote = parse_options(argc, argv, &unixctl_path);
    signal(SIGPIPE, SIG_IGN); //ignore SIGPIPE (write to a pipe whose read end is closed)
    sighup = signal_register(SIGHUP); //register the SIGHUP signal handler
    process_init(); //create the notification pipe and register the child-exit signal handler
    ovsrec_init(); //initialize the ovsdb table definitions
    daemonize_start(); //daemonize the process

    if (want_mlockall) {
#ifdef HAVE_MLOCKALL
        if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
            VLOG_ERR("mlockall failed: %s", strerror(errno));
        }
#else
        VLOG_ERR("mlockall not supported on this system");
#endif
    }

    worker_start(); //start a worker subprocess that runs worker_main() (receive and process data)

    retval = unixctl_server_create(unixctl_path, &unixctl); //create a unix domain socket
    if (retval) {
        exit(EXIT_FAILURE);
    }
    unixctl_command_register("exit", "", 0, 0, ovs_vswitchd_exit, &exiting);

    bridge_init(remote); //init the bridges: read configuration from the ovsdb server, register control commands
    free(remote);

    exiting = false;
    while (!exiting) {
        worker_run(); //process replies from the worker subprocess
        if (signal_poll(sighup)) {
            vlog_reopen_log_file();
        }
        memory_run(); //monitor memory usage
        if (memory_should_report()) {
            struct simap usage;

            simap_init(&usage);
            bridge_get_memory_usage(&usage);
            memory_report(&usage);
            simap_destroy(&usage);
        }
        bridge_run_fast(); //run each bridge's fast-path handler
        bridge_run(); //main processing: packet handling and reconfiguration
        bridge_run_fast();
        unixctl_server_run(unixctl);
        netdev_run(); //run the periodic functions of all network devices
        worker_wait();
        signal_wait(sighup);
        memory_wait();
        bridge_wait();
        unixctl_server_wait(unixctl);
        netdev_wait();
        if (exiting) {
            poll_immediate_wake();
        }
        poll_block();
    }
    bridge_exit();
    unixctl_server_destroy(unixctl);
    signal_unregister(sighup);

    return 0;
}
proctitle_init(argc, argv)
Copies the argument list into newly allocated storage and points argv at it. This prepares for the later proctitle_set() call (invoked from daemonize_start()->monitor_daemon()), which may overwrite the original argv storage.
set_program_name(argv[0])
Sets the program name, version, build date, and related information.
stress_init_command()
Registers the stress-related commands (list, set, enable, disable) into the commands structure.
remote = parse_options(argc, argv, &unixctl_path)
Parses the command-line options. unixctl_path holds the name of the unixctl domain socket used to accept external control commands; remote holds the information for connecting to ovsdb, i.e. the socket name of the configuration database.
signal(SIGPIPE, SIG_IGN)
Ignores SIGPIPE, which is raised when writing to a pipe whose read end has been closed.
sighup = signal_register(SIGHUP)
Registers a handler for SIGHUP (terminal hangup). The handler writes a null byte into fds[1].
process_init()
Registers a handler for SIGCHLD (child process exited). The handler goes through all of the processes registered on all_process.
ovsrec_init()
Initializes the database table definitions, 13 tables in total. See the ovsdb documentation for the concrete table schemas.
daemonize_start()
Turns the process into a daemon.
worker_start()
Starts a worker subprocess, which exchanges data with the main process.
unixctl_server_create(unixctl_path, &unixctl)
Creates a unixctl server (stored in unixctl) and listens on the punix path given by unixctl_path.
unixctl_command_register("exit", "", 0, 0,
vs_vswitchd_exit, &exiting)
注册unixctl命令。
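For reference, the handler behind the "exit" command looks roughly like this (a sketch modeled on ovs-vswitchd.c; the exact code may differ slightly):

/* Sketch of the "exit" handler registered above: it flips the 'exiting'
 * flag that the main loop tests, then acknowledges the command. */
static void
ovs_vswitchd_exit(struct unixctl_conn *conn, int argc OVS_UNUSED,
                  const char *argv[] OVS_UNUSED, void *exiting_)
{
    bool *exiting = exiting_;

    *exiting = true;
    unixctl_command_reply(conn, NULL);
}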
bridge_init(remote)
Fetches the configuration from the remote database and initializes the bridges.
The main loop
exiting = false;
while (!exiting) {
    worker_run(); //process replies from the worker subprocess
    if (signal_poll(sighup)) {
        vlog_reopen_log_file();
    }
    memory_run(); //monitor memory usage
    if (memory_should_report()) {
        struct simap usage;

        simap_init(&usage);
        bridge_get_memory_usage(&usage);
        memory_report(&usage);
        simap_destroy(&usage);
    }
    bridge_run_fast(); //run each bridge's fast-path handler
    bridge_run(); //main processing: packet handling and reconfiguration
    bridge_run_fast();
    unixctl_server_run(unixctl);
    netdev_run(); //run the periodic functions of all network devices
    worker_wait();
    signal_wait(sighup);
    memory_wait();
    bridge_wait();
    unixctl_server_wait(unixctl);
    netdev_wait();
    if (exiting) {
        poll_immediate_wake();
    }
    poll_block();
}
worker_run()
Processes the RPC replies received from the worker subprocess, invoking the reply_cb callback of each. The main steps are:
rxbuf_run(&client_rx, client_sock, sizeof(struct worker_reply));
reply->reply_cb(&client_rx.payload, client_rx.fds, client_rx.n_fds, reply->reply_aux);
bridge_run_fast()
Calls run_fast on the ofproto of every bridge on all_bridges. This mainly listens for and handles upcalls from the datapath:
HMAP_FOR_EACH (br, node, &all_bridges) {
    ofproto_run_fast(br->ofproto);
}
bridge_run()
The main slow-path processing of packets. It performs any necessary configuration updates (reading the configuration from the database and creating the required bridge and datapath data structures), runs run() on the ofproto of every bridge on all_bridges, and collects the corresponding statistics.
The ofproto run() in turn calls the following, in order (see the sketch after this list):
dpif_run() to handle events reported by all registered netlink notifiers;
run_fast() to handle the common periodic work, including upcall processing;
optionally netflow_run() and sflow_run() for NetFlow and sFlow support;
optionally port_run() to send CCMs;
optionally bundle_run() to handle LACP, bonding, and other miscellanea;
optionally stp_run() for STP support;
mac_learning_run() to find and delete expired MAC entries;
optionally governor_run() for rate limiting.
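Put together, the control flow of run() in ofproto-dpif.c looks roughly as follows. This is a condensed sketch under the assumption of the code version analyzed here; error handling and most details are omitted, so treat the exact calls as approximate:

/* Condensed sketch of run() in ofproto-dpif.c; details omitted. */
static int
run(struct ofproto *ofproto_)
{
    struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);

    dpif_run(ofproto->dpif);             /* netlink notifier events */
    run_fast(ofproto_);                  /* upcalls, CCM transmission */

    if (ofproto->netflow) {
        netflow_run(ofproto->netflow);   /* NetFlow export */
    }
    if (ofproto->sflow) {
        dpif_sflow_run(ofproto->sflow);  /* sFlow sampling */
    }
    /* port_run(), bundle_run(), stp_run(), mac_learning_run() and
     * governor_run() follow here in the real code. */

    return 0;
}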
unixctl_server_run(unixctl)
Reads data from the server stored in unixctl and executes the corresponding control commands. The main steps are:
struct unixctl_conn *conn = xzalloc(sizeof *conn);
list_push_back(&server->conns, &conn->node);
conn->rpc = jsonrpc_open(stream);
netdev_run()
Walks every netdev_class registered in netdev_classes and calls its run() hook:
SHASH_FOR_EACH(node, &netdev_classes) {
    const struct netdev_class *netdev_class = node->data;
    if (netdev_class->run) {
        netdev_class->run();
    }
}
Waiting for events in the loop
The worker, signal, memory, bridge, unixctl_server, and netdev events are registered with poll_fd_wait().
poll_block(void)
Blocks until an event previously registered with poll_fd_wait() occurs, or until the shortest timeout registered with poll_timer_wait() expires.
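The general run/wait/block idiom shared by all of these modules looks roughly like this (a generic sketch; module_run() and module_wait() are hypothetical placeholders for any of the worker_*, bridge_*, netdev_* pairs):

/* Generic sketch of the OVS poll-loop idiom. */
for (;;) {
    module_run();    /* do whatever work is currently ready */
    module_wait();   /* register interest via poll_fd_wait()/poll_timer_wait() */
    poll_block();    /* sleep until a registered event fires or times out */
}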
Cleanup
Exits the bridges, destroys the unixctl server, and unregisters the SIGHUP handler.
bridge_exit();
unixctl_server_destroy(unixctl);
signal_unregister(sighup);
Common types
Basic macros
We first look at a few commonly used basic macros.
The CONTAINER_OF macro returns the starting address of the structure that contains a given member. STRUCT is the structure type, MEMBER is the name of a field within it, and POINTER is a pointer to that field; the macro returns the address of the enclosing structure. It is defined as:
#define CONTAINER_OF(POINTER, STRUCT, MEMBER)                           \
        ((STRUCT *) (void *) ((char *) (POINTER) - offsetof (STRUCT, MEMBER)))
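A small usage illustration (struct my_port and my_port_from_node() are hypothetical, made up for this example):

/* Hypothetical illustration of CONTAINER_OF. */
struct my_port {
    struct hmap_node node;   /* Embedded linkage, e.g. in a bridge's hmap. */
    char *name;
};

static struct my_port *
my_port_from_node(struct hmap_node *node)
{
    /* Given a pointer to the embedded 'node' field, recover the
     * address of the enclosing struct my_port. */
    return CONTAINER_OF(node, struct my_port, node);
}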
The OBJECT_CONTAINING macro returns the address of the object (of the same type as OBJECT) that contains the given member:
#define OBJECT_CONTAINING(POINTER, OBJECT, MEMBER)                      \
    ((OVS_TYPEOF(OBJECT)) (void *)                                      \
     ((char *) (POINTER) - OBJECT_OFFSETOF(OBJECT, MEMBER)))
The ASSIGN_CONTAINER macro assigns to OBJECT the address of the object containing the given member and evaluates to 1, so it can be used inside for-loop expressions:
#define ASSIGN_CONTAINER(OBJECT, POINTER, MEMBER) \
    ((OBJECT) = OBJECT_CONTAINING(POINTER, OBJECT, MEMBER), 1)
Plain lists
The OVS code makes heavy use of linked lists, declared in lib/list.h. Users do not need to care about the abstract list structure; they only maintain the data structures of the nodes they care about.
To use a list: struct list L = LIST_INITIALIZER(&L).
Forward iteration:
#define LIST_FOR_EACH(ITER, MEMBER, LIST)                               \
    for (ASSIGN_CONTAINER(ITER, (LIST)->next, MEMBER);                  \
         &(ITER)->MEMBER != (LIST);                                     \
         ASSIGN_CONTAINER(ITER, (ITER)->MEMBER.next, MEMBER))
Reverse iteration:
#define LIST_FOR_EACH_REVERSE(ITER, MEMBER, LIST)                       \
    for (ASSIGN_CONTAINER(ITER, (LIST)->prev, MEMBER);                  \
         &(ITER)->MEMBER != (LIST);                                     \
         ASSIGN_CONTAINER(ITER, (ITER)->MEMBER.prev, MEMBER))
Safe iteration (the current node may be removed during traversal):
#define LIST_FOR_EACH_SAFE(ITER, NEXT, MEMBER, LIST)            \
    for (ASSIGN_CONTAINER(ITER, (LIST)->next, MEMBER);          \
         (&(ITER)->MEMBER != (LIST)                             \
          ? ASSIGN_CONTAINER(NEXT, (ITER)->MEMBER.next, MEMBER) \
          : 0);                                                 \
         (ITER) = (NEXT))
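A minimal usage sketch (struct my_item and sum_items() are hypothetical, made up for this example):

/* Hypothetical example of embedding 'struct list' and iterating with
 * LIST_FOR_EACH. */
struct my_item {
    struct list node;   /* Linkage in the 'items' list. */
    int value;
};

static int
sum_items(struct list *items)
{
    struct my_item *item;
    int total = 0;

    LIST_FOR_EACH (item, node, items) {
        total += item->value;
    }
    return total;
}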
Hash maps
Declared in lib/hmap.h.
An hmap contains two pointers to the nodes it holds (the bucket array and a single-node shortcut) and is defined as:
/* A hash map. */
struct hmap {
    struct hmap_node **buckets; /* Must point to 'one' iff 'mask' == 0. */
    struct hmap_node *one;
    size_t mask;
    size_t n;
};
The hmap_node type holds a hash value and a pointer to the next node:
struct hmap_node {
    size_t hash;                /* Hash value. */
    struct hmap_node *next;     /* Next in linked list. */
};
Iteration over an hmap is implemented by the following macro:
#define HMAP_FOR_EACH(NODE, MEMBER, HMAP)                               \
    for (ASSIGN_CONTAINER(NODE, hmap_first(HMAP), MEMBER);              \
         &(NODE)->MEMBER != NULL;                                       \
         ASSIGN_CONTAINER(NODE, hmap_next(HMAP, &(NODE)->MEMBER), MEMBER))
ASSIGN_CONTAINER takes three arguments: an output pointer, an input pointer, and a member name. The input pointer points at the member, and the output pointer receives the address of the structure that contains that member.
So the iteration macro visits every node of the hmap and hands back, one by one, the data structures that embed those nodes.
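A minimal usage sketch (struct my_iface and iface_find() are hypothetical; real code would normally use HMAP_FOR_EACH_WITH_HASH for lookups rather than a full scan):

/* Hypothetical example of iterating an hmap with HMAP_FOR_EACH. */
#include <string.h>

struct my_iface {
    struct hmap_node name_node;  /* In an hmap keyed on 'name'. */
    char *name;
};

static struct my_iface *
iface_find(const struct hmap *ifaces, const char *name)
{
    struct my_iface *iface;

    HMAP_FOR_EACH (iface, name_node, ifaces) {
        if (!strcmp(iface->name, name)) {
            return iface;
        }
    }
    return NULL;
}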
ofproto
ofproto_class
Log messages
#define VLOG_FATAL(...) vlog_fatal(THIS_MODULE, __VA_ARGS__)
#define VLOG_ABORT(...) vlog_abort(THIS_MODULE, __VA_ARGS__)
#define VLOG_EMER(...)  VLOG(VLL_EMER, __VA_ARGS__)
#define VLOG_ERR(...)   VLOG(VLL_ERR, __VA_ARGS__)
#define VLOG_WARN(...)  VLOG(VLL_WARN, __VA_ARGS__)
#define VLOG_INFO(...)  VLOG(VLL_INFO, __VA_ARGS__)
#define VLOG_DBG(...)   VLOG(VLL_DBG, __VA_ARGS__)
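Typical usage looks like this (a minimal sketch; the module name my_module and the function report_port() are illustrative, not from the tree):

/* Minimal usage sketch of the VLOG macros. */
#include "vlog.h"

VLOG_DEFINE_THIS_MODULE(my_module);

static void
report_port(const char *name, int mtu)
{
    VLOG_INFO("port %s has MTU %d", name, mtu);
    if (mtu < 68) {
        VLOG_WARN("port %s: MTU %d is suspiciously small", name, mtu);
    }
}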
 