Linux Network Architecture Network Layer

Linux Network Architecture: Network Layer. Isaac Y. Tsai <eplusplus@gmail.com>

Outline • Network Layer in Linux • Netfilter and iptables framework • PF_RING architecture

Interface between device driver and network layer

Network layer functions • <kernel src>/net/ipv4/ip_input.c • ip_rcv(skb) • ip_rcv_finish(skb) • ip_local_deliver(skb) • ip_local_deliver_finish(skb) • <kernel src>/net/ipv4/ip_forward.c • ip_forward(skb) • ip_forward_finish(skb) • <kernel src>/net/ipv4/ipmr.c • int ip_mr_input(skb) • <kernel src>/net/ipv4/ip_output.c • ip_queue_xmit(skb,ipfragok) • ip_local_out(skb) • __ip_local_out(skb) • ip_output(skb) • ip_finish_output(skb) • ip_finish_output2(skb) • ip_mc_output(skb)

netif_receive_skb() • <kernel src>/net/core/dev.c • int netif_receive_skb(struct sk_buff *skb) • { • struct packet_type *ptype, *pt_prev; • struct net_device *orig_dev, *master, *null_or_orig, *null_or_bond; • int ret = NET_RX_DROP; • __be16 type; • if (!skb->tstamp.tv64) net_timestamp(skb); • if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) • return NET_RX_SUCCESS; • if (netpoll_receive_skb(skb)) return NET_RX_DROP; • if (!skb->skb_iif) skb->skb_iif = skb->dev->ifindex; • null_or_orig = NULL; orig_dev = skb->dev; • master = ACCESS_ONCE(orig_dev->master);

netif_receive_skb() (cont’ed) • if (master) { • if (skb_bond_should_drop(skb, master)) null_or_orig = orig_dev; • else skb->dev = master; • } • __get_cpu_var(netdev_rx_stat).total++; • skb_reset_network_header(skb); skb_reset_transport_header(skb); • skb->mac_len = skb->network_header - skb->mac_header; pt_prev = NULL; • rcu_read_lock(); • #ifdef CONFIG_NET_CLS_ACT • if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); goto ncls; } • #endif • list_for_each_entry_rcu(ptype, &ptype_all, list) { • if (ptype->dev == null_or_orig || ptype->dev == skb->dev || ptype->dev == orig_dev) { • if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); • pt_prev = ptype; • } • }

netif_receive_skb() (cont’ed) • #ifdef CONFIG_NET_CLS_ACT • skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; • ncls: • #endif • skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); • if (!skb) goto out; • skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); • if (!skb) goto out; • null_or_bond = NULL; • if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && • (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { • null_or_bond = vlan_dev_real_dev(skb->dev); • } • type = skb->protocol;

netif_receive_skb() (cont’ed) • list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { • if (ptype->type == type && (ptype->dev == null_or_orig || • ptype->dev == skb->dev || ptype->dev == orig_dev || • ptype->dev == null_or_bond)) { • if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); • pt_prev = ptype; • } • } • if (pt_prev) { • ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); • } else { • kfree_skb(skb); • ret = NET_RX_DROP; • } • out: • rcu_read_unlock(); • return ret; • }

net_rx_action() • <kernel src>/net/core/dev.c • static void net_rx_action(struct softirq_action *h) • { • struct list_head *list = &__get_cpu_var(softnet_data).poll_list; • unsigned long time_limit = jiffies + 2; • int budget = netdev_budget; • void *have; • local_irq_disable(); • while (!list_empty(list)) { • struct napi_struct *n; • int work, weight; • if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) • goto softnet_break; • local_irq_enable(); • n = list_first_entry(list, struct napi_struct, poll_list);

net_rx_action() (cont’ed) • have = netpoll_poll_lock(n); • weight = n->weight; work = 0; • if (test_bit(NAPI_STATE_SCHED, &n->state)) { • work = n->poll(n, weight); • trace_napi_poll(n); • } • WARN_ON_ONCE(work > weight); • budget -= work; • local_irq_disable(); • if (unlikely(work == weight)) { • if (unlikely(napi_disable_pending(n))) { • local_irq_enable(); • napi_complete(n); • local_irq_disable(); • } else • list_move_tail(&n->poll_list, list); • } • netpoll_poll_unlock(have); • }

net_rx_action() (cont’ed) • out: • local_irq_enable(); • #ifdef CONFIG_NET_DMA • /* • * There may not be any more sk_buffs coming right now, so push • * any pending DMA copies to hardware • */ • dma_issue_pending_all(); • #endif • return; • softnet_break: • __get_cpu_var(netdev_rx_stat).time_squeeze++; • __raise_softirq_irqoff(NET_RX_SOFTIRQ); • goto out; • }

Packet reception path: ip_rcv() • Network layer packet reception code ip_rcv() • ip_rcv() first performs some error checking on the packet type and the packet header, and it updates some packet statistics. At the end of the code, it calls NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

ip_rcv() • <kernel src>/net/ipv4/ip_input.c • int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) • { • struct iphdr *iph; • u32 len; • if (skb->pkt_type == PACKET_OTHERHOST) goto drop; • IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); • if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { • IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); • goto out; • } • if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error;

ip_rcv() (cont’ed) • iph = ip_hdr(skb); • if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; • if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; • iph = ip_hdr(skb); • if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; • len = ntohs(iph->tot_len); • if (skb->len < len) { • IP_INC_STATS_BH(dev_net(dev),IPSTATS_MIB_INTRUNCATEDPKTS); • goto drop; • } else if (len < (iph->ihl*4)) goto inhdr_error;

ip_rcv() (cont’ed) • if (pskb_trim_rcsum(skb, len)) { • IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); • goto drop; • } • /* Remove any debris in the socket control block */ • memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); • /* Must drop socket now because of tproxy. */ • skb_orphan(skb); • return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); • inhdr_error: • IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); • drop: • kfree_skb(skb); • out: • return NET_RX_DROP; • }

ip_rcv_finish() • ip_rcv_finish() calls ip_route_input() • The skb->dst pointer of the socket buffer (read through skb_dst(skb) in the code listings) is set to an entry in the routing cache, which stores not only the destination at the IP level, but also a pointer to an entry in the hard header cache (the cache of layer-2 frame headers), if present. If ip_route_input() cannot find a route, the packet is discarded. Finally, in ip_rcv_finish(), IP processing reaches the junction between packets addressed to the local computer and packets to be forwarded. The information about the further path of an IP packet is stored in the routing entry skb->dst. Notice that a trick often used in the Linux kernel is applied here: where the value of a variable would otherwise be tested to select one of several functions, a pointer to the chosen function is stored instead. This saves an if or switch instruction at each decision about how processing should continue. In the example used here, the pointer skb->dst->input() points to the function that should be used to handle a packet further:

The pointer skb->dst->input() points to the function that should be used to handle a packet further: • ip_local_deliver() is entered in the case of unicast and multicast packets that should be delivered to the local computer. • ip_forward() handles all unicast packets that should be forwarded. • ip_mr_input() is used for multicast packets that should be forwarded.

ip_rcv_finish(skb) • <kernel src>/net/ipv4/ip_input.c • static int ip_rcv_finish(struct sk_buff *skb) • { • const struct iphdr *iph = ip_hdr(skb); • struct rtable *rt; • if (skb_dst(skb) == NULL) { • int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); • if (unlikely(err)) { • if (err == -EHOSTUNREACH) • IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INADDRERRORS); • else if (err == -ENETUNREACH) • IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INNOROUTES); • goto drop; • } • }

ip_rcv_finish(skb) (cont’ed) • #ifdef CONFIG_NET_CLS_ROUTE • if (unlikely(skb_dst(skb)->tclassid)) { • struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); • u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; • st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; • st[(idx>>16)&0xFF].i_bytes += skb->len; • } • #endif • if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; • rt = skb_rtable(skb); • if (rt->rt_type == RTN_MULTICAST) { • IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, skb->len); • } else if (rt->rt_type == RTN_BROADCAST) • IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, skb->len); • return dst_input(skb); • drop: • kfree_skb(skb); return NET_RX_DROP; • }

ip_local_deliver(skb) • <kernel src>/net/ipv4/ip_input.c • /* Deliver IP Packets to the higher protocol layers. */ • int ip_local_deliver(struct sk_buff *skb) • { • /* Reassemble IP fragments. */ • if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { • if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) • return 0; • } • return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, • ip_local_deliver_finish); • }

ip_local_deliver_finish(skb) • <kernel src>/net/ipv4/ip_input.c • static int ip_local_deliver_finish(struct sk_buff *skb) • { • struct net *net = dev_net(skb->dev); • __skb_pull(skb, ip_hdrlen(skb)); • /* Point into the IP datagram, just past the header. */ • skb_reset_transport_header(skb); • rcu_read_lock(); • { • int protocol = ip_hdr(skb)->protocol; • int hash, raw; • const struct net_protocol *ipprot; • resubmit: • raw = raw_local_deliver(skb, protocol); • hash = protocol & (MAX_INET_PROTOS - 1); • ipprot = rcu_dereference(inet_protos[hash]);

ip_local_deliver_finish(skb) (cont’ed) • if (ipprot != NULL) { • int ret; • if (!net_eq(net, &init_net) && !ipprot->netns_ok) { • if (net_ratelimit()) • printk("%s: proto %d isn't netns-ready\n", __func__, protocol); • kfree_skb(skb); goto out; • } • if (!ipprot->no_policy) { • if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { • kfree_skb(skb); goto out; • } • nf_reset(skb); • } • ret = ipprot->handler(skb); • if (ret < 0) { • protocol = -ret; goto resubmit; • } • IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); • } else {

ip_local_deliver_finish(skb) (cont’ed) • if (!raw) { • if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { • IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); • icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); • } • } else • IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); • kfree_skb(skb); • } • } • out: • rcu_read_unlock(); • return 0; • }

dst_input(skb) • <net/dst.h> • static inline int dst_input(struct sk_buff *skb) { • return skb_dst(skb)->input(skb); • } • <linux/skbuff.h> • static inline struct dst_entry *skb_dst(const struct sk_buff *skb){ • return (struct dst_entry *)skb->_skb_dst; • } • static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst){ • skb->_skb_dst = (unsigned long)dst; • }

dst_output(skb) • <net/dst.h> • /* Output packet to network from transport. */ • static inline int dst_output(struct sk_buff *skb){ • return skb_dst(skb)->output(skb); • }

struct dst_entry • <net/dst.h> • struct dst_entry { • struct rcu_head rcu_head; • struct dst_entry *child; • struct net_device *dev; • short error, obsolete; • int flags; • unsigned long expires; • unsigned short header_len, trailer_len; /* space to reserve at tail */ • unsigned int rate_tokens; • unsigned long rate_last; /* rate limiting for ICMP */ • struct dst_entry *path; • struct neighbour *neighbour; • struct hh_cache *hh; • #ifdef CONFIG_XFRM • struct xfrm_state *xfrm; • #else

struct dst_entry (cont’ed) • void *__pad1; • #endif • int (*input)(struct sk_buff*); • int (*output)(struct sk_buff*); • struct dst_ops *ops; • u32 metrics[RTAX_MAX]; • #ifdef CONFIG_NET_CLS_ROUTE • __u32 tclassid; • #else • __u32 __pad2; • #endif • /* Align __refcnt to a 64 bytes alignment */ • #ifdef CONFIG_64BIT • long __pad_to_align_refcnt[1]; • #endif

struct dst_entry (cont’ed) • /* • * __refcnt wants to be on a different cache line from • * input/output/ops or performance tanks badly • */ • atomic_t __refcnt; /* client references */ • int __use; • unsigned long lastuse; • union { • struct dst_entry *next; • struct rtable *rt_next; • struct rt6_info *rt6_next; • struct dn_route *dn_next; • }; • };

ip_forward(skb) • The primary task of ip_forward(skb) is to process a few conditions of the Internet Protocol (e.g., a packet's lifetime) and packet options. First, packets not marked with pkt_type == PACKET_HOST are dropped. Next, the remaining lifetime of the packet is checked. If the value in its TTL field is 1 or less (before it is decremented), then the packet is dropped. RFC 791 specifies that, if such an action occurs, an ICMP packet has to be returned to the sender to inform the latter (ICMP_TIME_EXCEEDED). • Once a redirect message has been checked, if applicable, the socket buffer is checked to see whether it has sufficient headroom. This means that the function skb_cow(skb, headroom) is used to check whether there is still sufficient space for the MAC header of the output network device (out_dev->hard_header_len). If this is not the case, then skb_realloc_headroom() creates sufficient space. Subsequently, the TTL field of the IP packet is decremented by one. • When the actual packet length (including the MAC header) is known, it is checked whether the packet really fits into the frame format of the new output network device. If it is too long (skb->len > mtu), and if no fragmenting is allowed because the Don't-Fragment bit is set in the IP header, then the packet is discarded, and the ICMP message ICMP_FRAG_NEEDED is transmitted to the sender. In any case, the packet is not fragmented yet; fragmenting is delayed. The early test for such cases prevents potential Don't-Fragment candidates from running through the entire IP protocol-handling process, only to be dropped eventually.

ip_forward(skb) • <kernel src>/net/ipv4/ip_forward.c • int ip_forward(struct sk_buff *skb) • { • struct iphdr *iph; /* Our header */ • struct rtable *rt; /* Route we use */ • struct ip_options * opt = &(IPCB(skb)->opt); • if (skb_warn_if_lro(skb)) goto drop; • if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) • goto drop; • if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) • return NET_RX_SUCCESS; • if (skb->pkt_type != PACKET_HOST) • goto drop; • skb_forward_csum(skb); • /* According to the RFC, we must first decrease the TTL field. If • that reaches zero, we must reply an ICMP control message telling that the packet's lifetime expired. */ • if (ip_hdr(skb)->ttl <= 1) goto too_many_hops; • if (!xfrm4_route_forward(skb)) goto drop;

ip_forward(skb) (cont’ed) • rt = skb_rtable(skb); • if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; • if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && • (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { • IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); • icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, • htonl(dst_mtu(&rt->u.dst))); • goto drop; • } • /* We are about to mangle packet. Copy it! */ • if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) • goto drop; • iph = ip_hdr(skb); /* Decrease ttl after skb cow done */ • ip_decrease_ttl(iph); • /* now generate an ICMP HOST REDIRECT giving the route calculated. */ • if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) • ip_rt_send_redirect(skb);

ip_forward(skb) (cont’ed) • skb->priority = rt_tos2priority(iph->tos); • return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, • ip_forward_finish); • sr_failed: • /* Strict routing permits no gatewaying */ • icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); • goto drop; • too_many_hops: • /* Tell the sender its packet died... */ • IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); • icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); • drop: • kfree_skb(skb); • return NET_RX_DROP; • }

ip_forward_finish(skb) • <kernel src>/net/ipv4/ip_forward.c • static int ip_forward_finish(struct sk_buff *skb) • { • struct ip_options * opt = &(IPCB(skb)->opt); • IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); • if (unlikely(opt->optlen)) ip_forward_options(skb); • return dst_output(skb); • } • ip_forward_finish(): this function actually has very little functionality (unless FASTROUTE is enabled). Once the IP options, if used, have been processed in ip_forward_options(), the packet is handed on for output. The original description names an ip_send() function that is invoked at this point to check whether the packet has to be fragmented and, if applicable, to fragment it; in the code listed here, the hand-off is instead the call to dst_output(skb), and the length check that leads to ip_fragment() appears in the ip_finish_output() listing.

ip_forward_options(skb) • <kernel src>/net/ipv4/ip_forward.c • void ip_forward_options(struct sk_buff *skb) • { • struct ip_options * opt = &(IPCB(skb)->opt); • unsigned char * optptr; • struct rtable *rt = skb_rtable(skb); • unsigned char *raw = skb_network_header(skb); • if (opt->rr_needaddr) { • optptr = (unsigned char *)raw + opt->rr; • ip_rt_get_source(&optptr[optptr[2]-5], rt); opt->is_changed = 1; • } • if (opt->srr_is_hit) { • int srrptr, srrspace; optptr = raw + opt->srr; • for ( srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { • if (srrptr + 3 > srrspace) break; • if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) break; • }

ip_forward_options(skb) (cont’ed) • if (srrptr + 3 <= srrspace) { • opt->is_changed = 1; • ip_rt_get_source(&optptr[srrptr-1], rt); • ip_hdr(skb)->daddr = rt->rt_dst; • optptr[2] = srrptr+4; • } else if (net_ratelimit()) • printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); • if (opt->ts_needaddr) { • optptr = raw + opt->ts; • ip_rt_get_source(&optptr[optptr[2]-9], rt); • opt->is_changed = 1; • } • } • if (opt->is_changed) { • opt->is_changed = 0; • ip_send_check(ip_hdr(skb)); • } • }

ip_send_check(iph) • <kernel src>/net/ipv4/ip_output.c • /* Generate a checksum for an outgoing IP datagram. */ • __inline__ void ip_send_check(struct iphdr *iph){ • iph->check = 0; • iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); • }

ip_queue_xmit(skb, ipfragok) • <kernel src>/net/ipv4/ip_output.c • int ip_queue_xmit(struct sk_buff *skb, int ipfragok) • { • struct sock *sk = skb->sk; • struct inet_sock *inet = inet_sk(sk); • struct ip_options *opt = inet->opt; • struct rtable *rt; • struct iphdr *iph; • rt = skb_rtable(skb); • if (rt != NULL) goto packet_routed; • /* Make sure we can route this packet. */ • rt = (struct rtable *)__sk_dst_check(sk, 0); • if (rt == NULL) { • __be32 daddr; • /* Use correct destination address if we have options. */ • daddr = inet->inet_daddr; • if(opt && opt->srr) daddr = opt->faddr;

ip_queue_xmit(skb, ipfragok) (cont’ed) • { • struct flowi fl = { .oif = sk->sk_bound_dev_if, • .mark = sk->sk_mark, • .nl_u = { .ip4_u = { .daddr = daddr, • .saddr = inet->inet_saddr, • .tos = RT_CONN_FLAGS(sk) } }, • .proto = sk->sk_protocol, • .flags = inet_sk_flowi_flags(sk), • .uli_u = { .ports = { .sport = inet->inet_sport, • .dport = inet->inet_dport } } }; • security_sk_classify_flow(sk, &fl); • if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) • goto no_route; • }

ip_queue_xmit(skb, ipfragok) (cont’ed) • sk_setup_caps(sk, &rt->u.dst); • } • skb_dst_set(skb, dst_clone(&rt->u.dst)); • packet_routed: • if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) • goto no_route; • skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); • skb_reset_network_header(skb); • iph = ip_hdr(skb); • *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); • if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) • iph->frag_off = htons(IP_DF); • else • iph->frag_off = 0; • iph->ttl = ip_select_ttl(inet, &rt->u.dst); • iph->protocol = sk->sk_protocol; • iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst;

ip_queue_xmit(skb, ipfragok) (cont’ed) • if (opt && opt->optlen) { • iph->ihl += opt->optlen >> 2; • ip_options_build(skb, opt, inet->inet_daddr, rt, 0); • } • ip_select_ident_more(iph, &rt->u.dst, sk, • (skb_shinfo(skb)->gso_segs ?: 1) - 1); • skb->priority = sk->sk_priority; • skb->mark = sk->sk_mark; • return ip_local_out(skb); • no_route: • IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); • kfree_skb(skb); • return -EHOSTUNREACH; • }

ip_local_out(skb) • <kernel src>/net/ipv4/ip_output.c • int ip_local_out(struct sk_buff *skb) • { • int err; • err = __ip_local_out(skb); • if (likely(err == 1)) err = dst_output(skb); • return err; • } • EXPORT_SYMBOL_GPL(ip_local_out);

__ip_local_out(skb) • <kernel src>/net/ipv4/ip_output.c • int __ip_local_out(struct sk_buff *skb) • { • struct iphdr *iph = ip_hdr(skb); • iph->tot_len = htons(skb->len); • ip_send_check(iph); • return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, • NULL, skb_dst(skb)->dev, dst_output); • }

ip_output(skb) • <kernel src>/net/ipv4/ip_output.c • int ip_output(struct sk_buff *skb) • { • struct net_device *dev = skb_dst(skb)->dev; • IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); • skb->dev = dev; • skb->protocol = htons(ETH_P_IP); • return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output,!(IPCB(skb)->flags & IPSKB_REROUTED)); • }

ip_finish_output(skb) • <kernel src>/net/ipv4/ip_output.c • static int ip_finish_output(struct sk_buff *skb) • { • #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) • /* Policy lookup after SNAT yielded a new policy */ • if (skb_dst(skb)->xfrm != NULL) { • IPCB(skb)->flags |= IPSKB_REROUTED; • return dst_output(skb); • } • #endif • if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) • return ip_fragment(skb, ip_finish_output2); • else • return ip_finish_output2(skb); • }

ip_finish_output2(skb) • <kernel src>/net/ipv4/ip_output.c • static inline int ip_finish_output2(struct sk_buff *skb) • { • struct dst_entry *dst = skb_dst(skb); • struct rtable *rt = (struct rtable *)dst; • struct net_device *dev = dst->dev; • unsigned int hh_len = LL_RESERVED_SPACE(dev); • if (rt->rt_type == RTN_MULTICAST) { • IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); • } else if (rt->rt_type == RTN_BROADCAST) • IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); • /* Be paranoid, rather than too clever. */ • if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { • struct sk_buff *skb2; • skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));

ip_finish_output2(skb) (cont’ed) • if (skb2 == NULL) { • kfree_skb(skb); • return -ENOMEM; • } • if (skb->sk) • skb_set_owner_w(skb2, skb->sk); • kfree_skb(skb); • skb = skb2; • } • if (dst->hh) return neigh_hh_output(dst->hh, skb); • else if (dst->neighbour) return dst->neighbour->output(skb); • if (net_ratelimit()) • printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); • kfree_skb(skb); • return -EINVAL; • }

Netfilter hooks for connection tracking

NF_HOOK() • <linux/netfilter.h> • static inline int • NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb, • struct net_device *in, struct net_device *out, • int (*okfn)(struct sk_buff *)) • { • return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN); • } • static inline int • NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb, • struct net_device *in, struct net_device *out, • int (*okfn)(struct sk_buff *), int thresh) • { • int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh); • if (ret == 1) ret = okfn(skb); • return ret; • }

Arguments of NF_HOOK macro • pf (protocol family): This is the identifier of the protocol family: PF_INET for IP Version 4, PF_INET6 for IP Version 6. • hook: This is the hook identifier. All valid identifiers for each protocol family are defined in a header file (e.g., <linux/netfilter_ipv4.h>). • skb: This is a pointer to the sk_buff structure with the packet to be handled. • indev (input device): This is a pointer to the net_device structure of the network device that received the packet. It is set to NULL for an outgoing packet, as in the NF_INET_LOCAL_OUT call made by __ip_local_out(), because no network device received the packet. • outdev (output device): This is a pointer to the net_device structure of the network device that should be used by the packet to leave the local computer. In the same example, the device used has to be determined first by use of the routing table; it is taken from the routing entry attached to the packet (skb_dst(skb)->dev). • okfn() (okay function): This function is invoked when all filter functions registered with this hook returned NF_ACCEPT, thereby okaying the packet's transit.

nf_hook() • <linux/netfilter.h> • static inline int nf_hook(u_int8_t pf, unsigned int hook, • struct sk_buff *skb, struct net_device *indev, • struct net_device *outdev, int (*okfn)(struct sk_buff *)) • { • return nf_hook_thresh(pf, hook, skb, indev, outdev, okfn, INT_MIN); • } • static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, • struct sk_buff *skb, struct net_device *indev, • struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh) • { • #ifndef CONFIG_NETFILTER_DEBUG • if (list_empty(&nf_hooks[pf][hook])) • return 1; • #endif • return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); • }

Linux Network Architecture Network Layer

Linux Network Architecture Network Layer

Presentation Transcript

Network Layer

Network layer

Network Layer

Network Layer

Network Layer

Network Layer

Network Layer

Linux Network Architecture Device Driver

Network Layer

Network Layer

Network Layer

Network Layer

Layer Architecture of Network Protocols

Network Layer

Network Layer

Network Layer

Network layer

Network Layer

Network Layer