Linux kernel network protocol stack --- Linux kernel routing mechanism (part 2): from the entry of the IP layer until the packet is processed
This continues the previous two articles in this series.
First, let's look at the IP header structure.
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)	/* little-endian bitfield layout */
	__u8	ihl:4,		/* header length (4 bits): in 32-bit words; minimum 5 (a 20-byte header), maximum 15 (1111 -> 15*32/8 = 60 bytes) */
		version:4;	/* version (4 bits): 4 for IPv4 */
#elif defined (__BIG_ENDIAN_BITFIELD)	/* big-endian layout: version comes first */
	__u8	version:4,
		ihl:4;
#else
#error	"Please fix <asm/byteorder.h>"
#endif
	__u8	tos;		/* type of service (8 bits): 3-bit precedence, 4 TOS bits (minimize delay / maximize throughput / maximize reliability / minimize cost, at most one set), and 1 bit that must be 0 */
	__be16	tot_len;	/* total length (16 bits): header plus data, in bytes */
	__be16	id;		/* identification (16 bits): shared by all fragments of one datagram; normally incremented for each packet sent */
	__be16	frag_off;	/* low 13 bits: fragment offset in 8-byte units; top 3 bits: (1) reserved, must be 0; (2) MF, "More Fragments", set on every fragment except the last; (3) DF, "Don't Fragment": if set and the packet needs fragmenting, IP drops it and returns an ICMP error */
	__u8	ttl;		/* time to live: remaining hop count */
	__u8	protocol;	/* protocol (8 bits): the upper-layer protocol, e.g. TCP or UDP */
	__sum16	check;		/* checksum (16 bits): computed over the IP header only */
	__be32	saddr;		/* source IP address */
	__be32	daddr;		/* destination IP address */
	/*The options start here. */
};
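Since version and ihl share the first byte on the wire (version in the high nibble), it is worth seeing how that byte decodes. A minimal sketch, my own example rather than kernel code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t first_byte = 0x45;          /* a typical header: version 4, ihl 5 */
	unsigned version = first_byte >> 4; /* high nibble is the version */
	unsigned ihl     = first_byte & 0x0F;
	printf("version=%u, header=%u bytes\n", version, ihl * 4); /* prints 4, 20 */
	return 0;
}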
Next, the ip_rcv function, where IP-layer processing starts:
/*
* Main IP Receive routine.
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev)
{
struct iphdr *iph;
u32 len;
	if (dev->nd_net != &init_net)	/* only the initial network namespace is handled here */
		goto drop;
	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	/* PACKET_OTHERHOST: the frame is not addressed to us (typically picked up
	 * in promiscuous mode), so layer 3 drops it here; see if_packet.h */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;
	IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);	/* update reception statistics */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {	/* if the skb is shared, clone it so we may modify it */
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);	/* clone failed: count the discard */
		goto out;
	}
	/* make sure at least a minimal IP header sits in the linear area before touching it */
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;
	iph = ip_hdr(skb);	/* the IP header */
/*
* RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
	/* ihl counts 32-bit words: minimum 5 (20 bytes, no options: 20*8/32 = 5 = 0101), maximum 15 (60 bytes) */
	if (iph->ihl < 5 || iph->version != 4)	/* sanity-check header length and version */
		goto inhdr_error;
	if (!pskb_may_pull(skb, iph->ihl*4))	/* pull the full header (including options) into the linear area */
		goto inhdr_error;
	iph = ip_hdr(skb);
	/* verify the IP header checksum */
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;
	/* validate the total length */
	len = ntohs(iph->tot_len);
	if (skb->len < len) {	/* truncated packet: drop */
		IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))	/* total length smaller than the header itself: bogus */
		goto inhdr_error;
	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	if (pskb_trim_rcsum(skb, len)) {	/* trim any link-layer padding so skb->len equals len */
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,	/* pass the PRE_ROUTING netfilter hook, then continue in ip_rcv_finish */
		       ip_rcv_finish);
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
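The ip_fast_csum() call above verifies the RFC 1071 one's-complement checksum over the header; summing a valid header, checksum field included, yields zero. The kernel version is hand-optimized per architecture; the helper below (ip_checksum is my own name) is only a portable sketch of the same computation:

#include <stdint.h>
#include <stddef.h>

/* 16-bit one's-complement sum over ihl*4 bytes of header */
static uint16_t ip_checksum(const void *hdr, unsigned ihl)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;
	size_t words = ihl * 2;	/* ihl counts 32-bit units, so 2*ihl 16-bit words */

	while (words--)
		sum += *p++;
	while (sum >> 16)	/* fold the carries back in */
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;	/* returns 0 for a header whose checksum verifies */
}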
Now let's focus on the ip_rcv_finish function.
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);	/* the IP header */
	struct rtable *rt;
	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */
	if (skb->dst == NULL) {	/* (1) no route attached yet: the routing lookup happens here */
		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
					 skb->dev);
		if (unlikely(err)) {
			if (err == -EHOSTUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
			else if (err == -ENETUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			goto drop;
		}
	}
	/* QoS accounting, keyed by the route's classid */
#ifdef CONFIG_NET_CLS_ROUTE
	if (unlikely(skb->dst->tclassid)) {
		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
		u32 idx = skb->dst->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes+=skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes+=skb->len;
	}
#endif
	/* (2) a header longer than 20 bytes carries options: handle them in ip_rcv_options */
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;
	rt = (struct rtable*)skb->dst;
	if (rt->rt_type == RTN_MULTICAST)	/* multicast statistics */
		IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)	/* broadcast statistics */
		IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
	return dst_input(skb);	/* (3) dispatch through the input hook installed by ip_route_input: ip_local_deliver for local packets, ip_forward for forwarded ones */
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
Now let's look at ip_route_input, ip_rcv_options, and dst_input in turn.
The ip_route_input function is as follows:
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
	struct rtable * rth;	/* a routing cache entry */
	unsigned	hash;
	int iif = dev->ifindex;	/* input interface index */
	struct net *net;	/* network namespace */
	net = dev->nd_net;
	tos &= IPTOS_RT_MASK;	/* keep only the TOS bits relevant to routing */
	hash = rt_hash(daddr, saddr, iif);	/* hash on destination, source and input interface */
	rcu_read_lock();	/* the cache is read under RCU */
	/* first try the routing cache (static struct rt_hash_bucket *rt_hash_table;
	 * entries chain via rth->u.dst.rt_next); for the data structures see
	 * http://blog.csdn.net/shanshanpt/article/details/19918171 */
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&	/* every key must match for a cache hit */
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos &&
		    rth->u.dst.dev->nd_net == net &&
		    rth->rt_genid == atomic_read(&rt_genid)) {	/* and the entry must belong to the current cache generation */
			dst_use(&rth->u.dst, jiffies);	/* bump __refcnt and __use, refresh lastuse */
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;	/* attach the cached route to the skb */
			return 0;	/* cache hit: done */
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
rcu_read_unlock();
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
network acquires a lot of useless route cache entries, sort of
SDR messages from all the world. Now we try to get rid of them.
Really, provided software IP multicast filter is organized
reasonably (at least, hashed), it does not result in a slowdown
comparing with route cache reject entries.
Note, that multicast routers are not affected, because
route cache entry is created eventually.
*/
	if (ipv4_is_multicast(daddr)) {	/* multicast destination */
		struct in_device *in_dev;
		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {	/* the device's IPv4 configuration */
			int our = ip_check_mc(in_dev, daddr, saddr,	/* are we a member of this multicast group? */
				ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE	/* or we do multicast forwarding */
			    || (!ipv4_is_local_multicast(daddr) &&
				IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,	/* multicast routing */
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);	/* cache miss: query the routing tables on the slow path */
}
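Everything here hinges on rt_hash() producing the same bucket for the same (daddr, saddr, iif) triple. The real function mixes the three keys with jhash and a per-generation random seed; the toy below (toy_rt_hash, my own name) only illustrates the bucket-index idea:

#include <stdint.h>

static unsigned toy_rt_hash(uint32_t daddr, uint32_t saddr, int iif,
			    unsigned buckets /* power of two */)
{
	uint32_t h = daddr ^ saddr ^ (uint32_t)iif;	/* combine the three lookup keys */

	h ^= h >> 16;			/* cheap avalanche; the kernel uses jhash */
	h *= 0x45d9f3bU;
	h ^= h >> 16;
	return h & (buckets - 1);	/* index into rt_hash_table[] */
}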
Now let's dig into the ip_route_input_slow function:
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
* 2. IP spoofing attempts are filtered with 100% of guarantee.
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct fib_result res;
struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =	/* build the lookup key for the FIB */
{ .daddr = daddr,
.saddr = saddr,
.tos = tos,
.scope = RT_SCOPE_UNIVERSE,
} },
.mark = skb->mark,
.iif = dev->ifindex };
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
int free_res = 0;
struct net * net = dev->nd_net;
/* IP on this device is disabled. */
	if (!in_dev)	/* IP is disabled on this device */
goto out;
/* Check for the most weird martians, which can be not detected
by fib_lookup.
*/
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||	/* a source must not be multicast, limited broadcast or loopback */
	    ipv4_is_loopback(saddr))
		goto martian_source;
	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))	/* limited broadcast */
		goto brd_input;
	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))	/* all-zero source */
		goto martian_source;
	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||	/* bogus destination: broadcast/zero/loopback cannot be routed here */
	    ipv4_is_loopback(daddr))
		goto martian_destination;
/*
* Now we are ready to route packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {	/* (1) the FIB lookup; on success the answer lands in res */
		if (!IN_DEV_FORWARD(in_dev))	/* no route and forwarding disabled: host unreachable */
			goto e_hostunreach;
		goto no_route;	/* no route: network unreachable */
	}
	free_res = 1;	/* res now holds a reference that must be released */
	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_BROADCAST)	/* broadcast destination */
		goto brd_input;
	if (res.type == RTN_LOCAL) {	/* destined for this host */
		int result;
		result = fib_validate_source(saddr, daddr, tos,	/* sanity-check the source address */
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;	/* bogus source */
		if (result)
			flags |= RTCF_DIRECTSRC;	/* the source is directly reachable */
		spec_dst = daddr;
		goto local_input;	/* build a local-delivery cache entry */
	}
	/* not local: the packet must be forwarded, if this device may do so */
	if (!IN_DEV_FORWARD(in_dev))	/* forwarding disabled */
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)	/* only unicast routes are forwarded */
		goto martian_destination;
	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);	/* (2) build and cache a forwarding route */
done:	/* common exit: release references and return err */
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;
brd_input:	/* broadcast (or all-zero) input */
	if (skb->protocol != htons(ETH_P_IP))	/* only IP broadcasts get this far */
		goto e_inval;
	if (ipv4_is_zeronet(saddr))	/* zero source: choose a preferred source address ourselves */
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,	/* validate the source address */
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);	/* broadcast cache statistics */
local_input:	/* build a cache entry for local delivery */
	rth = dst_alloc(&ipv4_dst_ops);	/* allocate the routing cache entry */
	if (!rth)
		goto e_nobufs;
	rth->u.dst.output= ip_rt_bug;	/* a local-input route must never be used for output */
	rth->rt_genid = atomic_read(&rt_genid);	/* stamp with the current cache generation */
	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;	/* a host route */
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;	/* fill in the lookup keys and route data */
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;	/* local delivery runs over the loopback device */
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;	/* the key assignment: local packets get ip_local_deliver; forwarded ones get ip_forward */
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {	/* unreachable: route to the error handler instead */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);	/* compute the cache hash */
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);	/* insert the entry into the routing cache */
	goto done;
no_route:	/* the lookup found no route */
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;	/* mark unreachable; local_input then wires up ip_error */
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;
/*
* Do not cache martian addresses: they should be logged (RFC1812)
*/
martian_destination:	/* bogus destination */
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_hostunreach:
	err = -EHOSTUNREACH;	/* host unreachable */
	goto done;
e_inval:
	err = -EINVAL;	/* invalid packet */
	goto done;
e_nobufs:
	err = -ENOBUFS;	/* out of memory */
	goto done;
martian_source:	/* bogus source: log it and fail */
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
Because the CONFIG_IP_MULTIPLE_TABLES option leads to two different ways of initializing the FIB tables, there are two cases to consider: with multiple routing tables (policy routing) and without.
> The multiple-routing-table case:
The most important routing function, fib_lookup:
int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
};
int err;
	err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg);	/* walk this namespace's IPv4 policy rules */
	res->r = arg.rule;	/* record which rule matched */
return err;
}
Next, the fib_rules_lookup function:
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
int flags, struct fib_lookup_arg *arg)
{
struct fib_rule *rule;
int err;
rcu_read_lock();
	list_for_each_entry_rcu(rule, &ops->rules_list, list) {	/* walk the rules in priority order */
	jumped:
		if (!fib_rule_match(rule, ops, fl, flags))	/* (1) does this rule apply to the flow? */
			continue;
		if (rule->action == FR_ACT_GOTO) {	/* jump to another rule */
			struct fib_rule *target;
			target = rcu_dereference(rule->ctarget);
			if (target == NULL) {
				continue;
			} else {
				rule = target;
				goto jumped;
			}
		} else if (rule->action == FR_ACT_NOP)	/* a no-op rule: skip it */
			continue;
		else
			err = ops->action(rule, fl, flags, arg);	/* (2) run the rule's action; for IPv4 this is fib4_rule_action */
		if (err != -EAGAIN) {	/* a definitive answer (success or hard error) ends the walk */
			fib_rule_get(rule);
			arg->rule = rule;
			goto out;
		}
}
err = -ESRCH;
out:
rcu_read_unlock();
return err;
}
Next, fib_rule_match:
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags)
{
int ret = 0;
	if (rule->ifindex && (rule->ifindex != fl->iif))	/* the interface index must match, if the rule names one */
		goto out;
	if ((rule->mark ^ fl->mark) & rule->mark_mask)	/* the firewall mark must match under the mask */
		goto out;
	ret = ops->match(rule, fl, flags);	/* family-specific match: fib4_rule_match for IPv4 */
out:
	return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}
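The mark test deserves a closer look: (rule->mark ^ fl->mark) & rule->mark_mask is non-zero exactly when the packet's mark differs from the rule's mark in a bit the mask selects. A self-contained check with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t rule_mark = 0x0010, mark_mask = 0x00F0;	/* only bits 4..7 matter */
	uint32_t fl_mark   = 0x0A13;				/* bits 4..7 are 0x1 as well */
	int mismatch = (rule_mark ^ fl_mark) & mark_mask;	/* 0x0A03 & 0x00F0 == 0 */

	printf("%s\n", mismatch ? "no match" : "match");	/* prints "match" */
	return 0;
}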
Let's look at the action and match operations of ops (these are the IPv4 ops). First the template:
static struct fib_rules_ops fib4_rules_ops_template = {
	.family		= AF_INET,
	.rule_size	= sizeof(struct fib4_rule),
	.addr_size	= sizeof(u32),
	.action		= fib4_rule_action,	/* the action operation */
	.match		= fib4_rule_match,	/* the match operation */
.configure = fib4_rule_configure,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
.default_pref = fib4_rule_default_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = fib4_rule_policy,
.owner = THIS_MODULE,
};
Now fib4_rule_action (fib4_rule_match works along the same lines):
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
			    int flags, struct fib_lookup_arg *arg)
{
int err = -EAGAIN;
struct fib_table *tbl;
	switch (rule->action) {	/* decide by the rule's action */
	case FR_ACT_TO_TBL:	/* look up a routing table: fall through */
		break;
	case FR_ACT_UNREACHABLE:	/* network unreachable */
		err = -ENETUNREACH;
		goto errout;
	case FR_ACT_PROHIBIT:
		err = -EACCES;
		goto errout;
	case FR_ACT_BLACKHOLE:
	default:
		err = -EINVAL;
		goto errout;
	}
	if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)	/* (step 1) find the routing table */
		goto errout;
	err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);	/* (step 2) look up the route inside that table */
	if (err > 0)
		err = -EAGAIN;
errout:
	return err;
}
First, fib_get_table:
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
	if (id == 0)
		id = RT_TABLE_MAIN;	/* default to the MAIN table */
	h = id & (FIB_TABLE_HASHSZ - 1);	/* hash index derived from the table id */
	rcu_read_lock();
	head = &net->ipv4.fib_table_hash[h];	/* this namespace's IPv4 table hash, filled in by fib_new_table */
	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {	/* walk the bucket's list */
		if (tb->tb_id == id) {	/* tb_id is the table id (LOCAL, MAIN, ...) */
			rcu_read_unlock();
			return tb;	/* found it */
		}
	}
	rcu_read_unlock();
	return NULL;
}
We now have the routing table in hand. The next step is to look up, within that table, how this packet should be handled; keep reading.
Next, tb_lookup:
What is tb_lookup really, and what is it set to? Recall that the tables were created by fib_new_table; is there anything important hiding in there?
As a side trip, here is fib_new_table:
struct fib_table *fib_new_table(struct net *net, u32 id)
{
struct fib_table *tb;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
tb = fib_get_table(net, id);
if (tb)
return tb;
	tb = fib_hash_table(id);	/* this is where the table and its operations are created */
if (!tb)
return NULL;
h = id & (FIB_TABLE_HASHSZ - 1);
hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
}
And fib_hash_table (another side trip):
struct fib_table *fib_hash_table(u32 id)
{
struct fib_table *tb;
	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
		     GFP_KERNEL);
if (tb == NULL)
return NULL;
tb->tb_id = id;
tb->tb_default = -1;
	tb->tb_lookup = fn_hash_lookup;	/* the lookup operation we care about */
	tb->tb_insert = fn_hash_insert;	/* route insertion */
	tb->tb_delete = fn_hash_delete;	/* route deletion */
tb->tb_flush = fn_hash_flush;
tb->tb_select_default = fn_hash_select_default;
tb->tb_dump = fn_hash_dump;
memset(tb->tb_data, 0, sizeof(struct fn_hash));
return tb;
}
So tb_lookup is really fn_hash_lookup; let's look at it:
static int
fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
int err;
struct fn_zone *fz;
	struct fn_hash *t = (struct fn_hash*)tb->tb_data;	/* the per-table fn_hash (the zones) hangs off tb_data */
	/* (for the data structures involved, see http://blog.csdn.net/shanshanpt/article/details/19918171) */
read_lock(&fib_hash_lock);
	for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {	/* walk the zones, one per prefix length, longest prefix first */
		struct hlist_head *head;
		struct hlist_node *node;
		struct fib_node *f;
		__be32 k = fz_key(flp->fl4_dst, fz);	/* key = fl4_dst & fz->fz_mask */
		head = &fz->fz_hash[fn_hash(k, fz)];	/* the hash bucket for this key */
		hlist_for_each_entry(f, node, head, fn_hash) {
			if (f->fn_key != k)	/* key mismatch: keep scanning the chain */
				continue;
			err = fib_semantic_match(&f->fn_alias,	/* key matched: check the semantics and fill res (this is where fib_info comes in) */
						 flp, res,
						 f->fn_key, fz->fz_mask,
						 fz->fz_order);
			if (err <= 0)
				goto out;	/* 0: found; negative: hard error */
		}
}
err = 1;
out:
read_unlock(&fib_hash_lock);
return err;
}
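The zone key is the heart of the longest-prefix match here: each fn_zone holds the routes of one prefix length, and fz_key() masks the destination down to that prefix before the hash compare, with the zones walked from longest prefix to shortest. A small standalone illustration (my own example, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	uint32_t daddr  = inet_addr("192.168.37.5");	/* network byte order */
	uint32_t mask24 = htonl(0xFFFFFF00);		/* the /24 zone's fz_mask */
	struct in_addr key = { .s_addr = daddr & mask24 };	/* what fz_key() computes */

	printf("key in the /24 zone: %s\n", inet_ntoa(key));	/* 192.168.37.0 */
	return 0;
}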
Now fib_semantic_match:
/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_result *res, __be32 zone, __be32 mask,
int prefixlen)
{
struct fib_alias *fa;
int nh_sel = 0;
	list_for_each_entry_rcu(fa, head, fa_list) {	/* walk the aliases of this node */
		int err;
		if (fa->fa_tos &&	/* the TOS must match, if the alias specifies one */
		    fa->fa_tos != flp->fl4_tos)
			continue;
		if (fa->fa_scope < flp->fl4_scope)	/* the scope must be wide enough */
			continue;
		fa->fa_state |= FA_S_ACCESSED;
		err = fib_props[fa->fa_type].error;	/* per-type error code; 0 means routable */
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;	/* the fib_info holds the next hops */
			if (fi->fib_flags & RTNH_F_DEAD)	/* a dead route: skip it */
				continue;
			switch (fa->fa_type) {	/* the routable types */
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {	/* nh and nhsel are defined by the for_nexthops macro */
					if (nh->nh_flags&RTNH_F_DEAD)	/* skip dead next hops */
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)	/* the output device matches (or none was requested) */
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH	/* multipath: any usable next hop will do */
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;	/* found one: fill in the result */
				}
#else
				if (nhsel < 1) {	/* only a single next hop */
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;
			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;
out_fill_res:	/* fill in the fib_result */
res->prefixlen = prefixlen;
res->nh_sel = nh_sel;
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
atomic_inc(&res->fi->fib_clntref);
return 0;
}
Finally, the fib_result structure itself:
struct fib_result {
	unsigned char	prefixlen;	/* prefix length of the matched route */
	unsigned char	nh_sel;		/* which next hop was selected (0 unless multipath) */
	unsigned char	type;		/* route type (RTN_*) */
	unsigned char	scope;		/* route scope */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	__u32           network;	/* network address */
	__u32           netmask;	/* network mask */
#endif
	struct fib_info *fi;		/* the route's fib_info */
#ifdef CONFIG_IP_MULTIPLE_TABLES
	struct fib_rule	*r;		/* the policy rule that matched */
#endif
};
That covers the multiple-table case. Now let's look at the lookup when there is only the fixed pair of routing tables.
> The single-table (non-policy) case:
static inline int fib_lookup(struct net *net, const struct flowi *flp,
			     struct fib_result *res)
{
struct fib_table *table;
	table = fib_get_table(net, RT_TABLE_LOCAL);	/* the LOCAL table is always consulted first */
	if (!table->tb_lookup(table, flp, res))
		return 0;
	table = fib_get_table(net, RT_TABLE_MAIN);	/* then the MAIN table */
	if (!table->tb_lookup(table, flp, res))
		return 0;
return -ENETUNREACH;
}
The fib_get_table used in this configuration is much simpler:
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct hlist_head *ptr;
	ptr = id == RT_TABLE_LOCAL ?
		&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :	/* the LOCAL table */
		&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];	/* the MAIN table */
return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}
From this point on, everything works the same way as in the multiple-table case above.
So by now routing has either succeeded (the answer is in res) or failed; let's go back to where we branched off.
What we should return to is ip_mkroute_input in ip_route_input_slow, which creates the routing cache entry.
------> Its job is to add the new route to the routing cache:
static inline int ip_mkroute_input(struct sk_buff *skb,
struct fib_result* res,
const struct flowi *fl,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
struct rtable* rth = NULL;
int err;
unsigned hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
fib_select_multipath(fl, res);
#endif
/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);	/* (1) build the cache entry into rth */
	if (err)
		return err;
	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);	/* hash on destination, source and input interface */
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);	/* (2) insert it into the routing cache */
}
Now __mkroute_input:
static inline int __mkroute_input(struct sk_buff *skb,
struct fib_result* res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
{
struct rtable *rth;
int err;
struct in_device *out_dev;
unsigned flags = 0;
__be32 spec_dst;
u32 itag;
/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),	/* validate the source address */
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,	/* bogus source */
					 saddr);
		err = -EINVAL;
		goto cleanup;
	}
	if (err)
		flags |= RTCF_DIRECTSRC;
	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;
	if (skb->protocol != htons(ETH_P_IP)) {	/* not an IP packet (e.g. ARP): special rules apply */
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {	/* same device in and out: invalid for proxy arp */
			err = -EINVAL;
			goto cleanup;
		}
	}
	rth = dst_alloc(&ipv4_dst_ops);	/* allocate the cache entry */
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}
	atomic_set(&rth->u.dst.__refcnt, 1);	/* initial reference */
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;	/* fill in the rtable's fields */
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input = ip_forward;	/* forwarded packets go through ip_forward... */
	rth->u.dst.output = ip_output;	/* ...and leave through ip_output */
	rth->rt_genid = atomic_read(&rt_genid);
	rt_set_nexthop(rth, res, itag);
	rth->rt_flags = flags;
	*result = rth;	/* hand the new entry back to the caller */
err = 0;
cleanup:
/* release the working reference to the output device */
in_dev_put(out_dev);
return err;
}
Before the entry can go into the hash table, dst_alloc is called to allocate the route cache node from a SLAB cache; it also considers garbage collection on every allocation:
void * dst_alloc(struct dst_ops * ops)
{
struct dst_entry * dst;
	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {	/* over the threshold: try garbage collection first */
		if (ops->gc(ops))
			return NULL;
	}
	dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);	/* allocate from the SLAB cache */
	if (!dst)
		return NULL;
	/* initialize the common dst fields */
	atomic_set(&dst->__refcnt, 0);
	dst->ops = ops;
	dst->lastuse = jiffies;
	dst->path = dst;
	dst->input = dst->output = dst_discard;
#if RT_CACHE_DEBUG >= 2
	atomic_inc(&dst_total);
#endif
	atomic_inc(&ops->entries);
	return dst;	/* hand the fresh entry back */
}
Then let's see how the new route is inserted into the hash table, in rt_intern_hash:
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
struct rtable *rth, **rthp;
unsigned long now;
struct rtable *cand, **candp;
u32 min_score;
int chain_length;
int attempts = !in_softirq();
restart:
chain_length = 0;
min_score = ~(u32)0;
cand = NULL;
candp = NULL;
now = jiffies;
	rthp = &rt_hash_table[hash].chain;	/* head of the hash chain for this bucket */
	/* Walk the chain: stale entries are removed along the way; if an equivalent
	 * entry already exists it is moved to the front and reused; otherwise the
	 * best eviction candidate is remembered in case the chain is too long. */
	spin_lock_bh(rt_hash_lock_addr(hash));	/* per-bucket lock, softirq-safe, so other CPUs can still work on other buckets */
	while ((rth = *rthp) != NULL) {
		if (rth->rt_genid != atomic_read(&rt_genid)) {	/* stale generation: unlink and free */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {	/* an equivalent entry already exists */
			/* Put it first */
			/* Move the existing entry to the head of the chain so the next
			 * lookup finds it quickly, and drop the entry we just built. */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
			dst_use(&rth->u.dst, now);	/* bump usage */
			spin_unlock_bh(rt_hash_lock_addr(hash));
			rt_drop(rt);	/* discard the duplicate */
			*rp = rth;
			return 0;
		}
		/* No match yet: track the least valuable unreferenced entry
		 * (lowest rt_score) as an eviction candidate. */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);
			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}
		chain_length++;
		rthp = &rth->u.dst.rt_next;	/* advance to the next entry */
	}
	if (cand) {
		/* The whole chain has been scanned; if it grew too long, evict the candidate. */
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {	/* chain too long: free the least valuable entry */
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}
/* Try to bind route to arp only if it is output
route or unicast forwarding path.
*/
	/* For output routes and unicast forwarding, bind the route to an ARP
	 * neighbour entry now; if the neighbour table is full, shrink the route
	 * cache aggressively and retry, since it likely pins neighbour records. */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);	/* bind to the ARP neighbour */
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));
			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}
			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			/* run rt_garbage_collect once with temporarily aggressive
			 * parameters, then restart the insertion */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}
			if (net_ratelimit())	/* still no luck: give up */
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}
	rt->u.dst.rt_next = rt_hash_table[hash].chain;	/* link the new entry's next to the current chain head */
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	/* publish the new entry at the head of the chain */
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
return 0;
}
OK, the new route has been created and inserted into the routing cache. So where do we return to?
When ip_mkroute_input returns inside ip_route_input_slow, ip_route_input_slow is finished, and with it ip_route_input.
That puts us back in ip_rcv_finish: by now the route has either been found or the lookup failed and the packet was dropped. So look at the last statement of ip_rcv_finish, dst_input:
/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
int err;
for (;;) {
		err = skb->dst->input(skb);	/* dispatch through the input hook the route installed */
		/* for received packets this is either rth->u.dst.input = ip_local_deliver;
		 * (local delivery) or rth->u.dst.input = ip_forward; (forwarding) */
		if (likely(err == 0))
			return err;
return err;
/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
if (unlikely(err != NET_XMIT_BYPASS))
return err;
}
}
Forwarding first: a packet that is not addressed to this host goes through ip_forward:
int ip_forward(struct sk_buff *skb)
{
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options * opt = &(IPCB(skb)->opt);
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))	/* IPsec policy check */
		goto drop;
	/* Router-alert handling: if the IP option is present and a raw socket has
	 * registered interest, ip_call_ra_chain delivers the packet there instead
	 * of forwarding it. */
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;
	if (skb->pkt_type != PACKET_HOST)	/* only forward frames that layer 2 addressed to us */
		goto drop;
	skb_forward_csum(skb);	/* essentially skb->ip_summed = CHECKSUM_NONE: the header will
				 * be modified, so the checksum has to be recomputed later */
	/*
	 *	According to the RFC, we must first decrease the TTL field. If
	 *	that reaches zero, we must reply an ICMP control message telling
	 *	that the packet's lifetime expired.
	 */
	if (ip_hdr(skb)->ttl <= 1)	/* TTL exhausted */
		goto too_many_hops;
	if (!xfrm4_route_forward(skb))	/* IPsec (Internet Protocol Security) routing check */
		goto drop;
	rt = (struct rtable*)skb->dst;	/* the route attached by the input path */
	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)	/* strict source routing: the next hop must be the gateway listed in the option */
		goto sr_failed;
	if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {	/* too big and DF set: we may not fragment */
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,	/* tell the sender to fragment */
			  htonl(dst_mtu(&rt->u.dst)));
		goto drop;
	}
	/* We are about to mangle packet. Copy it! */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))	/* unshare/expand the skb before modifying it */
		goto drop;
	iph = ip_hdr(skb);	/* the IP header */
	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);	/* decrement TTL and patch the checksum incrementally */
	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp)	/* the packet re-enters the network it came from: send an ICMP REDIRECT (unless source routing or IPsec is involved) */
		ip_rt_send_redirect(skb);
	skb->priority = rt_tos2priority(iph->tos);	/* QoS priority from the TOS field */
	return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
		       ip_forward_finish);	/* pass the FORWARD hook, then ip_forward_finish */
sr_failed:
/*
* Strict routing permits no gatewaying
*/
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);	/* report the strict-routing failure */
goto drop;
too_many_hops:
	/* Tell the sender its packet died... */	/* ttl <= 1: report it via ICMP time-exceeded */
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
	kfree_skb(skb);	/* free the packet */
return NET_RX_DROP;
}
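A note on the ip_decrease_ttl() call above: TTL sits in the high byte of a 16-bit header word, so decrementing it changes the one's-complement sum by a known amount, and the checksum can be patched incrementally (RFC 1141) instead of being recomputed. A simplified sketch of that helper, with the header reduced to the two fields involved (not a verbatim copy of the kernel's):

#include <stdint.h>
#include <arpa/inet.h>

struct toy_iphdr {
	uint8_t  ttl;	/* hop limit */
	uint16_t check;	/* one's-complement header checksum */
	/* the remaining header fields are elided */
};

static uint8_t toy_decrease_ttl(struct toy_iphdr *iph)
{
	uint32_t check = iph->check;

	check += htons(0x0100);	/* TTL-1 shifts the sum by +0x0100 (network order) */
	iph->check = (uint16_t)(check + (check >= 0xFFFF));	/* end-around carry */
	return --iph->ttl;
}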
Next, ip_forward_finish:
static int ip_forward_finish(struct sk_buff *skb)
{
struct ip_options * opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	if (unlikely(opt->optlen))
		ip_forward_options(skb);	/* finish processing IP options for forwarded packets */
	return dst_output(skb);	/* dst_output calls skb->dst->output: ip_output for unicast, ip_mc_output for multicast */
}
Let's look at the unicast case first.
ip_output:
int ip_output(struct sk_buff *skb)
{
struct net_device *dev = skb->dst->dev;
IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,	/* pass POST_ROUTING, then ip_finish_output */
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
Now ip_finish_output:
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb->dst->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))	/* larger than the MTU and not GSO: fragment */
		return ip_fragment(skb, ip_finish_output2);	/* fragment, sending each piece via ip_finish_output2 */
	else
		return ip_finish_output2(skb);	/* small enough: send directly */
}
Now fragmentation, in ip_fragment:
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
* a block of the data of the original IP data part) that will yet fit in a
* single device frame, and queue such a frame for sending.
*/
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))	/* here output is ip_finish_output2 */
{
struct iphdr *iph;
int raw = 0;
int ptr;
struct net_device *dev;
struct sk_buff *skb2;
unsigned int mtu, hlen, left, len, ll_rs, pad;
int offset;
__be16 not_last_frag;
struct rtable *rt = (struct rtable*)skb->dst;
int err = 0;
dev = rt->u.dst.dev;
/*
* Point into the IP datagram header.
*/
	iph = ip_hdr(skb);	/* the IP header */
	/* DF set and not overridden by local_df: we may not fragment, so reply with an ICMP error */
	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,	/* "fragmentation needed" ICMP error */
			  htonl(ip_skb_dst_mtu(skb)));
kfree_skb(skb);
return -EMSGSIZE;
}
/*
* Setup starting values.
*/
	hlen = iph->ihl * 4;	/* header length: ihl counts 32-bit (4-byte) words */
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */	/* MTU minus header: payload per fragment */
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
* one, it is not prohibited. In this case fall back to copying.
*
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
	 */
	/* fast path: layer 4 already left the data split up in skb's frag_list, so those buffers can be reused */
	if (skb_shinfo(skb)->frag_list) {	/* skb_shinfo(SKB) is ((struct skb_shared_info *)((SKB)->end)) */
		struct sk_buff *frag;	/* the shared info lives at the end of the data area; see struct skb_shared_info below */
int first_len = skb_pagelen(skb);
int truesizes = 0;
		/* the fast path requires: the head fits the MTU, every piece except the
		 * last is a multiple of 8 bytes, the packet is not itself already a
		 * fragment (IP_MF/offset clear), and the skb is not cloned */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;	/* conditions not met: take the slow path */
		/* validate each fragment in the list */
		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;	/* bad geometry or no room for a header */
			/* Partially cloned skb? */	/* shared fragments cannot be modified in place */
			if (skb_shared(frag))
				goto slow_path;
			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);	/* transfer socket ownership to the fragment */
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}
		/* Everything is OK. Generate! */
		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;	/* detach the fragment list */
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);	/* trim the head skb down to the first fragment */
		skb->truesize -= truesizes;
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);	/* first fragment: set MF */
		ip_send_check(iph);	/* recompute the header checksum */
		for (;;) {	/* emit the fragments one by one */
/* Prepare header of the next frame,
* before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;	/* checksums will be recomputed */
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);	/* make room in front of the data for the IP header */
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);	/* copy the IP header into this fragment */
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);	/* this fragment's total length */
				ip_copy_metadata(frag, skb);	/* copy the skb metadata */
				if (offset == 0)
					ip_options_fragment(frag);	/* strip the options that must not repeat; later fragments copy this stripped header */
				offset += skb->len - hlen;	/* running payload offset */
				iph->frag_off = htons(offset>>3);	/* the offset field counts 8-byte units */
				if (frag->next != NULL)	/* every fragment except the last carries MF */
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}
			err = output(skb);	/* send this fragment via ip_finish_output2 */
			if (!err)
				IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;
			skb = frag;	/* the next fragment becomes the skb to send */
			frag = skb->next;
			skb->next = NULL;
		}	/* end of the send loop */
		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}
		while (frag) {	/* on error, free the unsent fragments */
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */
	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	pad = nf_bridge_pad(skb);	/* account for bridge/VLAN/PPPoE encapsulation in the MTU */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
	mtu -= pad;
	/*
	 *	Fragment the datagram.
	 */
	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;	/* any existing offset: low 13 bits, in 8-byte units */
	not_last_frag = iph->frag_off & htons(IP_MF);	/* remember whether MF was already set */
	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {	/* allocate and fill one new skb per fragment */
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)	/* cap each fragment's payload at the MTU */
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;	/* every fragment except the last must carry a multiple of 8 bytes */
		}
		/*
		 *	Allocate buffer.
		 */
		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {	/* a new skb for this fragment */
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}
		/*
		 *	Set up data on packet
		 */
		ip_copy_metadata(skb2, skb);	/* copy the metadata from the original skb */
		skb_reserve(skb2, ll_rs);	/* reserve link-layer headroom */
		skb_put(skb2, len + hlen);	/* room for the header plus this fragment's data */
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;	/* the transport header follows the IP header */
		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)	/* charge the fragment to the owning socket */
			skb_set_owner_w(skb2, skb->sk);
		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);	/* copy the IP header */
		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))	/* copy this fragment's payload */
			BUG();
		left -= len;	/* account for the copied bytes */
/*
* Fill in the new header fields.
*/
		iph = ip_hdr(skb2);	/* this fragment's header */
		iph->frag_off = htons((offset >> 3));	/* the offset field counts 8-byte units */
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
* on the initial skb, so that all the following fragments
* will inherit fixed options.
*/
		if (offset == 0)	/* fix the options once, on the first fragment */
			ip_options_fragment(skb);
/*
* Added AC : If we are fragmenting a fragment that's not the
* last fragment then keep MF on each bit
*/
if (left > 0 || not_last_frag)
iph->frag_off |= htons(IP_MF);
ptr += len;
offset += len;
/*
* Put this fragment into the sending queue.
*/
		iph->tot_len = htons(len + hlen);
		ip_send_check(iph);	/* recompute the checksum */
		err = output(skb2);	/* send the fragment */
if (err)
goto fail;
IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
}
kfree_skb(skb);
IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
return err;
}
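To make the slow-path arithmetic concrete, here is a standalone sketch that splits a payload the same way: every fragment except the last carries a multiple of 8 data bytes, and frag_off stores the offset in 8-byte units. The numbers are made up:

#include <stdio.h>

int main(void)
{
	int hlen = 20, mtu = 1500, payload = 4000;
	int max_data = (mtu - hlen) & ~7;	/* 1480: largest 8-byte-aligned chunk */

	for (int off = 0; off < payload; off += max_data) {
		int len = payload - off < max_data ? payload - off : max_data;
		int mf  = (off + len < payload);	/* MF on all but the last */
		printf("frag_off=%d (byte %d), len=%d, MF=%d\n",
		       off >> 3, off, len, mf);
	}
	return 0;	/* fragments at byte offsets 0, 1480 and 2960 */
}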
For reference, here is skb_shared_info:
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
struct skb_shared_info {
	atomic_t	dataref;	/* reference count on the data area */
	unsigned short	nr_frags;	/* number of page fragments in frags[] */
	unsigned short	gso_size;	/* GSO segment size */
	/* Warning: this field is not always filled in (UFO)! */
	unsigned short	gso_segs;
	unsigned short  gso_type;
	__be32          ip6_frag_id;	/* IPv6 fragment id */
	struct sk_buff	*frag_list;	/* the list of fragment skbs used above */
	skb_frag_t	frags[MAX_SKB_FRAGS];
};
OK, fragmentation is finally done. Now let's see how a packet actually gets sent: the output callback is ip_finish_output2:
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
	if (rt->rt_type == RTN_MULTICAST)	/* multicast statistics */
		IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)	/* broadcast statistics */
		IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));	/* too little headroom for the link-layer header: reallocate */
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
kfree_skb(skb);
skb = skb2;
}
	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);	/* a cached link-layer header exists: build the frame and hand it to the device */
	else if (dst->neighbour)
		return dst->neighbour->output(skb);	/* go through the neighbour (ARP) subsystem to resolve the link-layer address and send */
	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
kfree_skb(skb);
return -EINVAL;
}
And with that, the unicast path is complete.
Now the multicast case, ip_mc_output:
int ip_mc_output(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct rtable *rt = (struct rtable*)skb->dst;
struct net_device *dev = rt->u.dst.dev;
/*
* If the indicated interface is up and running, send the packet.
*/
IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
/*
* Multicasts are looped back for other local users
*/
if (rt->rt_flags&RTCF_MULTICAST) {
if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
/* Small optimization: do not loopback not local frames,
which returned after forwarding; they will be dropped
by ip_mr_input in any case.
Note, that local frames are looped back to be delivered
to local recipients.
This check is duplicated in ip_mr_input at the moment.
*/
&& ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
NULL, newskb->dev,
ip_dev_loopback_xmit);
}
/* Multicasts with ttl 0 must not go beyond the host */
if (ip_hdr(skb)->ttl == 0) {
kfree_skb(skb);
return 0;
}
}
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
newskb->dev, ip_dev_loopback_xmit);
}
	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,	/* pass POST_ROUTING, then ip_finish_output */
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
What we covered above is the transmit path as entered from ip_forward; sending locally generated data reaches these functions by a different route.
From here on, we discuss delivering a packet up to the higher layers of this host, i.e. ip_local_deliver:
int ip_local_deliver(struct sk_buff *skb)	/* reassemble IP fragments, then hand off to ip_local_deliver_finish */
{
/*
* Reassemble IP fragments.
*/
	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {	/* a fragment: MF set or a non-zero offset */
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))	/* queue it until the whole datagram has arrived */
			return 0;
	}
	return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);	/* pass the LOCAL_IN hook, then ip_local_deliver_finish */
}
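The frag_off test is a neat trick: a packet is a fragment if MF is set or the offset is non-zero, which catches the first fragment (offset 0, MF=1) and the last (offset non-zero, MF=0) alike. A quick check, in host order for brevity:

#include <stdio.h>

#define IP_MF     0x2000	/* "more fragments" flag */
#define IP_OFFSET 0x1FFF	/* 13-bit offset mask */

static int is_fragment(unsigned frag_off)
{
	return frag_off & (IP_MF | IP_OFFSET);
}

int main(void)
{
	printf("%d %d %d\n",
	       is_fragment(0),		/* 0: a whole datagram */
	       is_fragment(IP_MF),	/* non-zero: first fragment */
	       is_fragment(185));	/* non-zero: a last fragment at byte 1480 */
	return 0;
}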
First let's look at ip_defrag:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct sk_buff *skb, u32 user)
{
	struct ipq *qp;	/* the reassembly queue for this datagram */
	struct net *net;
	IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
	net = skb->dev ? skb->dev->nd_net : skb->dst->dev->nd_net;
	/* Start by cleaning up the memory. */
	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)	/* too much memory tied up in fragments: ip_evictor() frees the oldest queues */
		ip_evictor(net);
	/* Lookup (or create) queue header */
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {	/* find the queue this fragment belongs to, creating it if needed */
		int ret;
		spin_lock(&qp->q.lock);
		ret = ip_frag_queue(qp, skb);	/* insert the fragment; once all pieces are in, the IP datagram is reassembled */
		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}
	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
Now ip_find:
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
struct inet_frag_queue *q;
struct ip4_create_arg arg;
unsigned int hash;
arg.iph = iph;
arg.user = user;
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);	/* hash on id, addresses and protocol */
	/* the hash is salted with ip4_frags.rnd, initialized as
	 * (u32) ((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))) */
	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);	/* find the queue, creating it if necessary */
	if (q == NULL)	/* allocation failed */
		goto out_nomem;	/* note: the queues hang off ip4_frags */
return container_of(q, struct ipq, q);
out_nomem:
	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
return NULL;
}
And inet_frag_find:
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
{
struct inet_frag_queue *q;
struct hlist_node *n;
read_lock(&f->lock);
	hlist_for_each_entry(q, n, &f->hash[hash], list) {	/* search the bucket */
		if (q->net == nf && f->match(q, key)) {	/* same namespace and matching key */
			atomic_inc(&q->refcnt);
			read_unlock(&f->lock);
			return q;	/* found an existing queue */
		}
	}
	read_unlock(&f->lock);
	return inet_frag_create(nf, f, key, hash);	/* none found: create a new queue */
}
OK, back in ip_defrag, let's look at the last function, ip_frag_queue: how do all the fragments get put back together?
There is not much to say about the details: the fragments are kept sorted by offset, and once every piece has arrived they are linked into one complete skb.
Just notice that these qp queues ultimately hang off the global ip4_frags.
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
int flags, offset;
int ihl, end;
int err = -ENOENT;
if (qp->q.last_in & COMPLETE)
goto err;
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
ipq_kill(qp);
goto err;
}
offset = ntohs(ip_hdr(skb)->frag_off);
flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET;
offset <<= 3; /* offset is in 8-byte chunks */
ihl = ip_hdrlen(skb);
/* Determine the position of this fragment. */
end = offset + skb->len - ihl;
err = -EINVAL;
/* Is this the final fragment? */
if ((flags & IP_MF) == 0) {
/* If we already have some bits beyond end
* or have different end, the segment is corrrupted.
*/
if (end < qp->q.len ||
((qp->q.last_in & LAST_IN) && end != qp->q.len))
goto err;
qp->q.last_in |= LAST_IN;
qp->q.len = end;
} else {
if (end&7) {
end &= ~7;
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->ip_summed = CHECKSUM_NONE;
}
if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
if (qp->q.last_in & LAST_IN)
goto err;
qp->q.len = end;
}
}
if (end == offset)
goto err;
err = -ENOMEM;
if (pskb_pull(skb, ihl) == NULL)
goto err;
err = pskb_trim_rcsum(skb, end - offset);
if (err)
goto err;
/* Find out which fragments are in front and at the back of us
* in the chain of fragments so far. We must know where to put
* this fragment, right?
*/
prev = NULL;
for (next = qp->q.fragments; next != NULL; next = next->next) {
if (FRAG_CB(next)->offset >= offset)
break; /* bingo! */
prev = next;
}
/* We found where to put this one. Check for overlap with
* preceding fragment, and, if needed, align things so that
* any overlaps are eliminated.
*/
if (prev) {
int i = (FRAG_CB(prev)->offset + prev->len) - offset;
if (i > 0) {
offset += i;
err = -EINVAL;
if (end <= offset)
goto err;
err = -ENOMEM;
if (!pskb_pull(skb, i))
goto err;
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->ip_summed = CHECKSUM_NONE;
}
}
err = -ENOMEM;
while (next && FRAG_CB(next)->offset < end) {
int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
if (i < next->len) {
/* Eat head of the next overlapped fragment
* and leave the loop. The next ones cannot overlap.
*/
if (!pskb_pull(next, i))
goto err;
FRAG_CB(next)->offset += i;
qp->q.meat -= i;
if (next->ip_summed != CHECKSUM_UNNECESSARY)
next->ip_summed = CHECKSUM_NONE;
break;
} else {
struct sk_buff *free_it = next;
/* Old fragment is completely overridden with
* new one drop it.
*/
next = next->next;
if (prev)
prev->next = next;
else
qp->q.fragments = next;
qp->q.meat -= free_it->len;
frag_kfree_skb(qp->q.net, free_it, NULL);
}
}
FRAG_CB(skb)->offset = offset;
/* Insert this fragment in the chain of fragments. */
skb->next = next;
if (prev)
prev->next = skb;
else
qp->q.fragments = skb;
dev = skb->dev;
if (dev) {
qp->iif = dev->ifindex;
skb->dev = NULL;
}
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
atomic_add(skb->truesize, &qp->q.net->mem);
if (offset == 0)
qp->q.last_in |= FIRST_IN;
if (qp->q.last_in == (FIRST_IN | LAST_IN) && qp->q.meat == qp->q.len)
return ip_frag_reasm(qp, prev, dev);
write_lock(&ip4_frags.lock);
list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
write_unlock(&ip4_frags.lock);
return -EINPROGRESS;
err:
kfree_skb(skb);
return err;
}
OK, once ip_defrag has reassembled the datagram we are back in ip_local_deliver, and next comes ip_local_deliver_finish:
static int ip_local_deliver_finish(struct sk_buff *skb)
{
	__skb_pull(skb, ip_hdrlen(skb));	/* strip the IP header; layer 4 no longer needs it */
	/* Point into the IP datagram, just past the header. */
	skb_reset_transport_header(skb);	/* the transport header starts here now */
	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;	/* the upper-layer protocol number */
		int hash, raw;
		struct net_protocol *ipprot;
	resubmit:
		raw = raw_local_deliver(skb, protocol);	/* deliver a copy to any matching raw sockets */
		hash = protocol & (MAX_INET_PROTOS - 1);	/* index into the protocol table */
		if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {	/* a layer-4 handler is registered */
			int ret;	/* inet_protos[] holds struct net_protocol entries, one per protocol */
			if (!ipprot->no_policy) {	/* IPsec (IP security) policy check, unless the protocol opted out */
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			ret = ipprot->handler(skb);	/* hand the packet to layer 4, e.g. tcp_v4_rcv for TCP */
			if (ret < 0) {	/* a negative return asks for resubmission under protocol -ret */
				protocol = -ret;
				goto resubmit;
			}
			IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
		} else {	/* no layer-4 handler registered */
			if (!raw) {	/* and no raw socket took it: report protocol unreachable */
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
					icmp_send(skb, ICMP_DEST_UNREACH,	/* ICMP error back to the sender */
						  ICMP_PROT_UNREACH, 0);
				}
			} else
				IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
			kfree_skb(skb);
		}
}
out:
rcu_read_unlock();
return 0;
}
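The dispatch indexes inet_protos[] with protocol & (MAX_INET_PROTOS - 1); since MAX_INET_PROTOS is 256, that is simply the 8-bit protocol number. The same table-of-function-pointers pattern in miniature (toy names, nothing from the kernel):

#include <stdio.h>

#define MAX_PROTOS 256
typedef int (*handler_t)(const char *payload);

static int toy_tcp_rcv(const char *payload)
{
	printf("tcp: %s\n", payload);
	return 0;
}

static handler_t protos[MAX_PROTOS];

int main(void)
{
	protos[6] = toy_tcp_rcv;		/* IPPROTO_TCP == 6 */
	int protocol = 6;			/* as read from the IP header */
	handler_t h = protos[protocol & (MAX_PROTOS - 1)];
	return h ? h("payload") : -1;		/* no handler: unknown protocol */
}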
Here is that protocol structure:
/* This is used to register protocols. */
struct net_protocol {
	int	(*handler)(struct sk_buff *skb);	/* the receive handler: gets the skb */
	void	(*err_handler)(struct sk_buff *skb, u32 info);	/* the ICMP error handler */
int (*gso_send_check)(struct sk_buff *skb);
struct sk_buff *(*gso_segment)(struct sk_buff *skb,
int features);
int no_policy;
};
Each layer-4 protocol is defined as a net_protocol instance and placed into the inet_protos table under its hash:
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
.netns_ok = 1,
};
#endif
static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
.gso_send_check = udp4_ufo_send_check,
.gso_segment = udp4_ufo_fragment,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.no_policy = 1,
.netns_ok = 1,
};
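These instances get wired into inet_protos[] at boot; paraphrasing the registration calls from inet_init() in af_inet.c (error handling trimmed to the printk):

	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");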
For TCP, the handler is tcp_v4_rcv, which is where the story continues into layer 4....
OK, and that is the complete journey of a packet through the IP layer.