Linuxカーネルネットワークプロトコルスタック---Linuxカーネルルーティングメカニズム(二)のip層が開始する->パケットが処理されるまで


上の2編の続きです。前回までの記事はリンク先を参照してください。
まずipヘッダ構造を見てみましょう.
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)   // little-endian: bitfields are laid out low-order bits first
         __u8    ihl:4,                 // header length (4 bits): number of 32-bit words in the IP header; min 5 (20B), max 15 (15*32/8 = 60B)
                 version:4;             // version (4 bits): 4 for IPv4
 #elif defined (__BIG_ENDIAN_BITFIELD)  // big-endian: the two nibbles appear in the opposite order
         __u8    version:4,             // same fields, reversed for big-endian bitfield layout
                 ihl:4;
 #else
 #error  "Please fix "
 #endif
         __u8    tos;                   // type of service (8 bits): 3-bit precedence, 4-bit TOS (min delay / max throughput / max reliability / min cost), last bit must be 0
         __be16  tot_len;               // total length (16 bits): whole datagram length in bytes, header plus data
         __be16  id;                    // identification (16 bits): shared by all fragments of one datagram; typically incremented for each packet sent
         __be16  frag_off;              // low 13 bits: fragment offset, in units of 8 bytes; top 3 bits: bit 0 reserved (must be 0), bit 1 MF "more fragments" (set on every fragment except the last), bit 2 DF "don't fragment" (when set, a packet needing fragmentation is dropped and an ICMP error is returned)
         __u8    ttl;                   // time to live: decremented at each hop; packet discarded when it reaches 0
         __u8    protocol;              // payload protocol (8 bits), e.g. TCP or UDP, used to demultiplex on receive
         __sum16 check;                 // header checksum (16 bits): covers the IP header only; recomputed at every hop since TTL changes
         __be32  saddr;                 // source IP address
         __be32  daddr;                 // destination IP address
         /*The options start here. */   // optional IP options follow when ihl > 5
 };

次に、ip_rcv関数から見ていきましょう:
/*
 *      Main IP Receive routine.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, 

struct net_device *orig_dev)
{
         struct iphdr *iph;
         u32 len;
 
         if (dev->nd_net != &init_net)     // only handle packets from devices in the initial network namespace
                 goto drop;
 
         /* When the interface is in promisc. mode, drop all the crap
          * that it receives, do not try to analyse it.
          */// in promiscuous mode we also receive frames destined for other hosts;
           // they were classified PACKET_OTHERHOST at layer 2 and are discarded here
         if (skb->pkt_type == PACKET_OTHERHOST) // pkt_type values are defined in if_packet.h
                 goto drop;
 
         IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); // statistics: count every received IP datagram
 
         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { // if the skb is shared, clone it so we can safely modify it
                 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);        // clone failed: count the discard
                 goto out;                              // note: no kfree_skb here — the original skb still has other users
         }
         // ensure at least the fixed 20-byte IP header is in the linear data area
         if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                 goto inhdr_error;
 
         iph = ip_hdr(skb);   // pointer to the IP header (skb network header)
 
         /*
          *      RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
          *
          *      Is the datagram acceptable?
          *
          *      1.      Length at least the size of an ip header
          *      2.      Version of 4
          *      3.      Checksums correctly. [Speed optimisation for later, skip loopback checksums]
          *      4.      Doesn't have a bogus length
          */
         // ihl counts 32-bit words: the minimum 20-byte header gives ihl == 5, the 60-byte maximum gives 15
         if (iph->ihl < 5 || iph->version != 4)  // sanity-check header length and protocol version
                 goto inhdr_error;
        if (!pskb_may_pull(skb, iph->ihl*4))// pull the full header (including any options) into the linear area
                 goto inhdr_error;
 
         iph = ip_hdr(skb);
         // verify the IP header checksum
         if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
                 goto inhdr_error;
         // compare the claimed total length with what we actually received
         len = ntohs(iph->tot_len);
         if (skb->len < len) {    // truncated packet: drop
                 IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
                 goto drop;
         } else if (len < (iph->ihl*4))   // total length smaller than the header itself is bogus
                 goto inhdr_error;
 
         /* Our transport medium may have padded the buffer out. Now we know it
          * is IP we can trim to the true length of the frame.
          * Note this now means skb->len holds ntohs(iph->tot_len).
          */// strip link-layer padding so that skb->len equals len
         if (pskb_trim_rcsum(skb, len)) {
                 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
                 goto drop;
         }
 
         /* Remove any debris in the socket control block */
         memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 
         return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,  // run the netfilter PRE_ROUTING hook, then continue in ip_rcv_finish
                        ip_rcv_finish);
 
 inhdr_error:
         IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 drop:
         kfree_skb(skb);
 out:
         return NET_RX_DROP;
 }

では、続いてip_rcv_finish関数を見てみましょう!
static int ip_rcv_finish(struct sk_buff *skb)  // continuation of ip_rcv, invoked after the PRE_ROUTING netfilter hook accepts the packet
{
         const struct iphdr *iph = ip_hdr(skb);   // IP header
         struct rtable *rt;
 
         /*
          *      Initialise the virtual path cache for the packet. It describes
          *      how the packet travels inside Linux networking.
          */
         if (skb->dst == NULL) {    // no route attached yet: make the routing decision now
                 int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,// routing lookup — sets skb->dst on success (step 1)
                                          skb->dev);
                 if (unlikely(err)) {
                         if (err == -EHOSTUNREACH)
                                 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
                         else if (err == -ENETUNREACH)
                                 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
                         goto drop;
                 }
         }
         // routing-classifier (QoS) accounting
 #ifdef CONFIG_NET_CLS_ROUTE
         if (unlikely(skb->dst->tclassid)) {
                 struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
                 u32 idx = skb->dst->tclassid;
                 st[idx&0xFF].o_packets++;
                 st[idx&0xFF].o_bytes+=skb->len;
                 st[(idx>>16)&0xFF].i_packets++;
                 st[(idx>>16)&0xFF].i_bytes+=skb->len;
         }
 #endif
         // a header longer than 20B means IP options are present: parse them (step 2)
         if (iph->ihl > 5 && ip_rcv_options(skb))
                 goto drop;
 
         rt = (struct rtable*)skb->dst;       
         if (rt->rt_type == RTN_MULTICAST)    // multicast statistics
                 IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
         else if (rt->rt_type == RTN_BROADCAST)  // broadcast statistics
                 IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
 
         return dst_input(skb);   // dispatch via dst->input, which ip_route_input set up: ip_local_deliver for local packets, ip_forward for forwarded ones (step 3)
 
 drop:
         kfree_skb(skb);
         return NET_RX_DROP;
 }

では、ip_route_input、ip_rcv_options、dst_inputをそれぞれ見ていきます。
ip_route_input関数は次のとおりです:
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                    u8 tos, struct net_device *dev)
{
         struct rtable * rth;            // routing cache entry: the fast path consults the cache before the FIB
         unsigned        hash;
         int iif = dev->ifindex;         // input interface index
         struct net *net;                // network namespace
 
         net = dev->nd_net;              // namespace the input device belongs to
         tos &= IPTOS_RT_MASK;           // mask out the TOS bits that do not participate in routing
         hash = rt_hash(daddr, saddr, iif); // cache bucket index derived from (dst, src, iif)
 
         rcu_read_lock();    // the cache chains are protected by RCU
         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;   // walk the bucket chain
              rth = rcu_dereference(rth->u.dst.rt_next)) {    // the table itself: static struct rt_hash_bucket *rt_hash_table;
                 if (rth->fl.fl4_dst == daddr &&              // a hit must match every field of the flow key
                     rth->fl.fl4_src == saddr &&
                     rth->fl.iif == iif &&
                     rth->fl.oif == 0 &&
                     rth->fl.mark == skb->mark &&
                     rth->fl.fl4_tos == tos &&
                     rth->u.dst.dev->nd_net == net &&
                     rth->rt_genid == atomic_read(&rt_genid)) {    // and belong to the current cache generation
                         dst_use(&rth->u.dst, jiffies);            // bump __refcnt/__use and refresh the last-used timestamp
                         RT_CACHE_STAT_INC(in_hit);
                         rcu_read_unlock();
                         skb->dst = (struct dst_entry*)rth;        // attach the cached route to the skb
                         return 0;                                 // cache hit: done
                 }
                 RT_CACHE_STAT_INC(in_hlist_search);
         }
         rcu_read_unlock();
 
         /* Multicast recognition logic is moved from route cache to here.
            The problem was that too many Ethernet cards have broken/missing
            hardware multicast filters :-( As result the host on multicasting
            network acquires a lot of useless route cache entries, sort of
            SDR messages from all the world. Now we try to get rid of them.
            Really, provided software IP multicast filter is organized
            reasonably (at least, hashed), it does not result in a slowdown
            comparing with route cache reject entries.
            Note, that multicast routers are not affected, because
            route cache entry is created eventually.
          */
         if (ipv4_is_multicast(daddr)) {       // multicast destinations bypass the cache entirely
                 struct in_device *in_dev;
 
                 rcu_read_lock();
                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {      // device has an IPv4 configuration
                         int our = ip_check_mc(in_dev, daddr, saddr,  // is this host a member of the multicast group?
                                 ip_hdr(skb)->protocol);
                         if (our
 #ifdef CONFIG_IP_MROUTE                                              // or multicast forwarding applies
                             || (!ipv4_is_local_multicast(daddr) &&   // not link-local multicast
                                 IN_DEV_MFORWARD(in_dev))
 #endif
                             ) {
                                 rcu_read_unlock();
                                 return ip_route_input_mc(skb, daddr, saddr,   // multicast slow path
                                                          tos, dev, our);      // "our" says whether we are a group member
                         }
                 }
                 rcu_read_unlock();
                 return -EINVAL;
         }
         return ip_route_input_slow(skb, daddr, saddr, tos, dev);   // cache miss: full FIB lookup, which also populates the cache
 }

具体的にip_route_input_slow関数を見てみましょう:
/*
 *      NOTE. We drop all the packets that has local source
 *      addresses, because every properly looped back packet
 *      must have correct destination already attached by output routine.
 *
 *      Such approach solves two big problems:
 *      1. Not simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with 100% of guarantee.
 */
 
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev)
{
         struct fib_result res;
         struct in_device *in_dev = in_dev_get(dev);
         struct flowi fl = { .nl_u = { .ip4_u =                  // flow key used for the FIB lookup
                                       { .daddr = daddr,
                                         .saddr = saddr,
                                         .tos = tos,
                                         .scope = RT_SCOPE_UNIVERSE,
                                       } },
                             .mark = skb->mark,
                             .iif = dev->ifindex };
         unsigned        flags = 0;
         u32             itag = 0;
         struct rtable * rth;
         unsigned        hash;
         __be32          spec_dst;
         int             err = -EINVAL;
         int             free_res = 0;
         struct net    * net = dev->nd_net;
 
         /* IP on this device is disabled. */
 
         if (!in_dev)       // device has no IPv4 configuration
                 goto out;
 
         /* Check for the most weird martians, which can be not detected
            by fib_lookup.
          */
 
         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||     // multicast, limited-broadcast or loopback sources are invalid
             ipv4_is_loopback(saddr))
                 goto martian_source;
 
         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) // limited broadcast
                 goto brd_input;
 
         /* Accept zero addresses only to limited broadcast;
          * I even do not know to fix it or not. Waiting for complains :-)
          */
         if (ipv4_is_zeronet(saddr))   // all-zero source is only allowed for limited broadcast (handled above)
                goto martian_source;
 
         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||   // these destinations are invalid by this point
             ipv4_is_loopback(daddr))
                 goto martian_destination;
 
         /*
          *      Now we are ready to route packet.
          */                                               // the central FIB lookup (step 1)
         if ((err = fib_lookup(net, &fl, &res)) != 0) {    // on success the answer is placed in res
                 if (!IN_DEV_FORWARD(in_dev))              // no route and forwarding disabled:
                         goto e_hostunreach;               // report host unreachable
                 goto no_route;   // forwarding enabled: cache a "no route" entry
         }
         free_res = 1;    // res now holds a reference that done: must release
 
         RT_CACHE_STAT_INC(in_slow_tot);
 
         if (res.type == RTN_BROADCAST)   // the route says this is a broadcast
                 goto brd_input;
 
         if (res.type == RTN_LOCAL) {     // destination is one of our own addresses: deliver locally
                 int result;
                 result = fib_validate_source(saddr, daddr, tos,          // reverse-path validation of the source address
                                              net->loopback_dev->ifindex,
                                              dev, &spec_dst, &itag);
                 if (result < 0)
                         goto martian_source;       // bogus source
                 if (result)
                         flags |= RTCF_DIRECTSRC;   // source is directly connected
                 spec_dst = daddr;                  // preferred source for replies
                 goto local_input;
         }
         // neither local nor broadcast: the packet has to be forwarded
         if (!IN_DEV_FORWARD(in_dev))   // forwarding disabled on the input device
                goto e_hostunreach;
         if (res.type != RTN_UNICAST)   // only unicast routes can be forwarded
                 goto martian_destination;

         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);  // build and cache the forwarding route (step 2)
 done:   // common exit: drop the references taken above and return err
         in_dev_put(in_dev);
         if (free_res)
                 fib_res_put(&res);
 out:    return err;
 
 brd_input:   // broadcast (or zero-address) input path
         if (skb->protocol != htons(ETH_P_IP)) // only IP packets are accepted here
                 goto e_inval;
 
         if (ipv4_is_zeronet(saddr))   // zero source: pick a link-scope address for replies
                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
         else {
                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, // validate the source address
                                           &itag);
                 if (err < 0)
                         goto martian_source;
                 if (err)
                         flags |= RTCF_DIRECTSRC;
         }
         flags |= RTCF_BROADCAST;   // mark the entry as broadcast
         res.type = RTN_BROADCAST;
         RT_CACHE_STAT_INC(in_brd); // broadcast counter
 
 local_input: // local delivery: allocate and fill a route cache entry
         rth = dst_alloc(&ipv4_dst_ops); // new cache entry
         if (!rth)
                 goto e_nobufs;
 
         rth->u.dst.output= ip_rt_bug;   // locally-delivered packets must never be sent out via this entry
         rth->rt_genid = atomic_read(&rt_genid); // stamp with the current cache generation
 
         atomic_set(&rth->u.dst.__refcnt, 1);
         rth->u.dst.flags= DST_HOST;   // host route: fully specified destination
         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                 rth->u.dst.flags |= DST_NOPOLICY;
         rth->fl.fl4_dst = daddr;   // record the flow key so later lookups can hit this entry
         rth->rt_dst     = daddr;
         rth->fl.fl4_tos = tos;
         rth->fl.mark    = skb->mark;
         rth->fl.fl4_src = saddr;
         rth->rt_src     = saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
         rth->u.dst.tclassid = itag;
 #endif
         rth->rt_iif     =
         rth->fl.iif     = dev->ifindex;
         rth->u.dst.dev  = net->loopback_dev;   // local delivery goes through the loopback device
         dev_hold(rth->u.dst.dev);
         rth->idev       = in_dev_get(rth->u.dst.dev);
         rth->rt_gateway = daddr;
         rth->rt_spec_dst= spec_dst;
         rth->u.dst.input= ip_local_deliver; // local packets are handed to ip_local_deliver (forwarded ones get ip_forward instead)
         rth->rt_flags   = flags|RTCF_LOCAL;
         if (res.type == RTN_UNREACHABLE) {  // the no_route path reuses this code, but input becomes ip_error
                rth->u.dst.input= ip_error;
                 rth->u.dst.error= -err;
                 rth->rt_flags   &= ~RTCF_LOCAL;
         }
         rth->rt_type    = res.type;
         hash = rt_hash(daddr, saddr, fl.iif);  // bucket selected by (dst, src, iif)
         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   // insert into the routing cache and attach to the skb
         goto done;
 
 no_route:   // FIB lookup failed with forwarding enabled
         RT_CACHE_STAT_INC(in_no_route);
         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
         res.type = RTN_UNREACHABLE; // cache a negative entry so ip_error answers subsequent packets
         if (err == -ESRCH)
                 err = -ENETUNREACH;
         goto local_input;
 
         /*
          *      Do not cache martian addresses: they should be logged (RFC1812)
          */
 martian_destination:   // invalid destination address
         RT_CACHE_STAT_INC(in_martian_dst);
 #ifdef CONFIG_IP_ROUTE_VERBOSE
         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
                         "%u.%u.%u.%u, dev %s\n",
                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
 #endif
 
 e_hostunreach:
         err = -EHOSTUNREACH;   // host unreachable
         goto done;
 
 e_inval:
         err = -EINVAL;         // invalid packet or address
         goto done;
 
 e_nobufs:
         err = -ENOBUFS;        // allocation failure
         goto done;
 
 martian_source:   // log and reject a bogus source address
         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
         goto e_inval;
 }

マクロCONFIG_IP_MULTIPLE_TABLESの有無によってfibテーブルの初期化は2通りに分かれるため、マルチルーティングテーブルを使う場合と単一ルーティングテーブルの場合がある。
>マルチルーティングテーブルの状況:
最も重要なルーティングプロセス関数:fib_lookup関数
/* Multi-table (policy routing) variant: walk the policy rules of this namespace. */
int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
{
         struct fib_lookup_arg arg = {
                 .result = res, 
         };
         int err;
 
         err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg);  // walk the namespace's IPv4 rule list
         res->r = arg.rule;  // remember the rule that produced the result
 
         return err;
}

次はfib_rules_lookup関数を見てみましょう:
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
                      int flags, struct fib_lookup_arg *arg)
{
         struct fib_rule *rule;
         int err;
 
         rcu_read_lock();
 
         list_for_each_entry_rcu(rule, &ops->rules_list, list) {  // rules in priority order
 jumped:
                 if (!fib_rule_match(rule, ops, fl, flags))   // does this rule apply to the flow? (step 1)
                         continue;
 
                 if (rule->action == FR_ACT_GOTO) {   // rule chains to another rule
                         struct fib_rule *target;
 
                         target = rcu_dereference(rule->ctarget);
                         if (target == NULL) {
                                 continue;
                         } else {
                                 rule = target;  // jump to the target rule
                                 goto jumped;    // and re-evaluate it
                         }
                 } else if (rule->action == FR_ACT_NOP)  // no-op rule: skip
                         continue;
                 else
                         err = ops->action(rule, fl, flags, arg);   // execute the rule's action; for IPv4 this is fib4_rule_action (step 2)
 
                 if (err != -EAGAIN) {     // -EAGAIN means "not in this table, try the next rule"
                         fib_rule_get(rule);
                         arg->rule = rule;
                         goto out;
                 }
         }
 
         err = -ESRCH;     // no rule matched
out:
         rcu_read_unlock();
 
         return err;
}

fib_rule_match :

static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
                           struct flowi *fl, int flags)
{
         int ret = 0;
 
         if (rule->ifindex && (rule->ifindex != fl->iif))   // input-interface restriction
                 goto out;
 
         if ((rule->mark ^ fl->mark) & rule->mark_mask)     // firewall-mark restriction
                 goto out;
 
         ret = ops->match(rule, fl, flags);   // family-specific match; for IPv4 this is fib4_rule_match
out:
         return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;  // inverted ("not") rules flip the result
}

ops(IPv4のops)のactionとmatchを見てみましょう。
まず定義を見てみましょう:
/* IPv4 policy-rule operations: fib_rules_lookup calls .match and .action through this. */
static struct fib_rules_ops fib4_rules_ops_template = {
     .family          = AF_INET,
     .rule_size     = sizeof(struct fib4_rule),
     .addr_size     = sizeof(u32),
     .action          = fib4_rule_action,    // invoked when a rule matches
     .match          = fib4_rule_match,      // invoked from fib_rule_match
     .configure     = fib4_rule_configure,
     .compare     = fib4_rule_compare,
     .fill          = fib4_rule_fill,
     .default_pref     = fib4_rule_default_pref,
     .nlmsg_payload     = fib4_rule_nlmsg_payload,
     .flush_cache     = fib4_rule_flush_cache,
     .nlgroup     = RTNLGRP_IPV4_RULE,
     .policy          = fib4_rule_policy,
     .owner          = THIS_MODULE,
};

次はfib4_rule_actionとfib4_rule_matchです:
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
                             int flags, struct fib_lookup_arg *arg)
{
         int err = -EAGAIN;
         struct fib_table *tbl;
 
         switch (rule->action) {   // what does the matching rule tell us to do?
         case FR_ACT_TO_TBL:       // consult a routing table (the normal case)
                 break;
 
         case FR_ACT_UNREACHABLE:  // administratively unreachable
                 err = -ENETUNREACH;
                 goto errout;
 
         case FR_ACT_PROHIBIT:     // administratively prohibited
                 err = -EACCES;
                 goto errout;
 
         case FR_ACT_BLACKHOLE:    // silently discard
         default:
                 err = -EINVAL;
                 goto errout;
         }
 
         if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)   // find the table the rule points at (step 1)
                 goto errout;
  
         err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); // search that table, e.g. fn_hash_lookup (step 2)
         if (err > 0)
                 err = -EAGAIN;   // not found here: let fib_rules_lookup try the next rule
 errout:
         return err;
 }

まずfib_get_table関数を見てみましょう:
struct fib_table *fib_get_table(struct net *net, u32 id)
{
     struct fib_table *tb;
     struct hlist_node *node;
     struct hlist_head *head;
     unsigned int h;

     if (id == 0)
          id = RT_TABLE_MAIN;                    // id 0 means "the main table"
     h = id & (FIB_TABLE_HASHSZ - 1);            // hash bucket index

     rcu_read_lock();
     head = &net->ipv4.fib_table_hash[h];        // per-namespace table hash, populated by fib_new_table
                                                 // (each namespace has its own set of routing tables)
     hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { // walk the bucket's list
          if (tb->tb_id == id) {     // match on table id (RT_TABLE_LOCAL, RT_TABLE_MAIN, ...)
               rcu_read_unlock();
               return tb;     // found the table
          }
     }
     rcu_read_unlock();
     return NULL;
}

ルーティングテーブルが見つかりました。次に、このパケットをどう処理するかをルーティングテーブルで探す必要があります。以下を見てください。
まず、tb_lookupについて説明します:
tb_lookupはどこで設定されたのでしょうか?どの関数に結び付けられているのでしょうか?tableはfib_new_tableが作成したものだったので、そこに手がかりがあるはずです。
というわけでfib_new_tableを見てみましょう(エピソードです~)
struct fib_table *fib_new_table(struct net *net, u32 id)
 {
         struct fib_table *tb;
         unsigned int h;
 
         if (id == 0)
                 id = RT_TABLE_MAIN;        // id 0 means "the main table"
         tb = fib_get_table(net, id);       // reuse an existing table if there is one
         if (tb)
                 return tb;
 
         tb = fib_hash_table(id);   // allocate and initialize a new hash-based table
         if (!tb)
                 return NULL;
         h = id & (FIB_TABLE_HASHSZ - 1);
         hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); // publish it in the namespace's table hash
         return tb;
 }

続いてfib_hash_tableを見てみましょう(エピソード):
/* Allocate and initialize one hash-based FIB table.
 * Fix: a stray "fib" token (remnant of a stripped comment) after the second
 * kmalloc argument made the call syntactically invalid; removed. */
struct fib_table *fib_hash_table(u32 id)
{
     struct fib_table *tb;

     /* one allocation covers the table header plus its fn_hash payload (tb_data) */
     tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
               GFP_KERNEL);
     if (tb == NULL)
          return NULL;

     tb->tb_id = id;
     tb->tb_default = -1;
     tb->tb_lookup = fn_hash_lookup;    // lookup entry point used by fib4_rule_action / fib_lookup
     tb->tb_insert = fn_hash_insert;    // route insertion
     tb->tb_delete = fn_hash_delete;    // route deletion
     tb->tb_flush = fn_hash_flush;
     tb->tb_select_default = fn_hash_select_default;
     tb->tb_dump = fn_hash_dump;
     memset(tb->tb_data, 0, sizeof(struct fn_hash));   // zero the fn_hash payload
     return tb;
}

tb_lookupの実体はfn_hash_lookupです。実装を見てみましょう。
fn_hash_lookup関数:
static int
fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
         int err;
         struct fn_zone *fz;
         struct fn_hash *t = (struct fn_hash*)tb->tb_data;  // the fn_hash payload stored right after the fib_table header
 
         read_lock(&fib_hash_lock);
         for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { // one zone per prefix length; list ordered so the longest prefix is tried first
                 struct hlist_head *head;
                 struct hlist_node *node;
                 struct fib_node *f;
                 __be32 k = fz_key(flp->fl4_dst, fz);       // key = fl4_dst & fz->fz_mask (destination masked by the zone's netmask)
 
                 head = &fz->fz_hash[fn_hash(k, fz)];       // bucket for this key within the zone
                 hlist_for_each_entry(f, node, head, fn_hash) {   // walk the bucket
                         if (f->fn_key != k)                // key mismatch: not this node
                                 continue;
 
                         err = fib_semantic_match(&f->fn_alias,   // key matched: check TOS/scope/nexthops against the node's aliases
                                                  flp, res,
                                                  f->fn_key, fz->fz_mask,  // zone mask and prefix length for the result
                                                  fz->fz_order);
                         if (err <= 0)
                                 goto out;    // 0 = found (res filled), <0 = hard error; >0 keeps searching
                 }
         }
         err = 1;     // 1 = no match in this table
out:
         read_unlock(&fib_hash_lock);
         return err;
}
 

次にfib_semantic_matchを見てみましょう:
/* Note! fib_semantic_match intentionally uses  RCU list functions. */
/* Check the aliases of a key-matched fib_node against the flow and, on
 * success, fill *res.  Returns 0 = match, >0 = keep searching, <0 = error.
 * Fix: the tail of this function was extraction-garbled (printk string split
 * by a raw newline, out_fill_res section collapsed onto one line); restored. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
                        struct fib_result *res, __be32 zone, __be32 mask,
                         int prefixlen)
{
         struct fib_alias *fa;
         int nh_sel = 0;
 
         list_for_each_entry_rcu(fa, head, fa_list) {   // walk the alias list
                 int err;
 
                 if (fa->fa_tos &&                      // TOS must match if the alias specifies one
                     fa->fa_tos != flp->fl4_tos)
                         continue;

                 if (fa->fa_scope < flp->fl4_scope)     // scope must be at least as wide as requested
                         continue;
 
                 fa->fa_state |= FA_S_ACCESSED;
 
                 err = fib_props[fa->fa_type].error;   // canonical error for this route type (0 = routable)
                 if (err == 0) {
                         struct fib_info *fi = fa->fa_info;   // shared routing information
 
                         if (fi->fib_flags & RTNH_F_DEAD)  // route is dead (e.g. its device went down)
                                continue;
 
                         switch (fa->fa_type) {    // route type
                         case RTN_UNICAST:
                         case RTN_LOCAL:
                         case RTN_BROADCAST:
                         case RTN_ANYCAST:
                         case RTN_MULTICAST:
                                 for_nexthops(fi) { // scan the next hops stored in fib_info
                                         if (nh->nh_flags&RTNH_F_DEAD)  // nh and nhsel are declared by the for_nexthops macro
                                                 continue;
                                         if (!flp->oif || flp->oif == nh->nh_oif) // no output device requested, or it matches
                                                 break; // usable nexthop found
                                 }
 #ifdef CONFIG_IP_ROUTE_MULTIPATH            // multipath: did the scan stop on a usable hop?
                                 if (nhsel < fi->fib_nhs) {
                                         nh_sel = nhsel;
                                         goto out_fill_res;   // fill the result
                                 }
 #else
                                 if (nhsel < 1) {             // single-nexthop case
                                         goto out_fill_res;
                                 }
 #endif
                                 endfor_nexthops(fi);
                                 continue;
 
                         default:
                                 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
                                        fa->fa_type);
                                 return -EINVAL;
                         }
                 }
                 return err;
         }
         return 1;     // no alias matched: keep searching

out_fill_res:   // fill the caller's fib_result from the matching alias
         res->prefixlen = prefixlen;
         res->nh_sel = nh_sel;
         res->type = fa->fa_type;
         res->scope = fa->fa_scope;
         res->fi = fa->fa_info;
         atomic_inc(&res->fi->fib_clntref);   // take a reference, released later via fib_res_put()
         return 0;
}

結果を格納するfib_result構造体を見てみましょう:
struct fib_result {
        unsigned char   prefixlen;   // prefix length of the matched route
        unsigned char   nh_sel;      // index of the selected next hop (0 unless multipath)
        unsigned char   type;        // route type (RTN_UNICAST, RTN_LOCAL, ...)
        unsigned char   scope;       // route scope
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED // cached multipath support
            __u32           network;    // network part of the destination
            __u32           netmask;    // netmask
#endif
        struct fib_info *fi;         // shared routing information (next hops, device, ...)
#ifdef CONFIG_IP_MULTIPLE_TABLES
        struct fib_rule *r;      // the policy rule that selected this result
#endif
    };

ここまでがマルチルーティングテーブルの場合です。次に、単一ルーティングテーブルの場合を見てみましょう。
>単一ルーティングテーブルの場合:
/* Single-table variant (no CONFIG_IP_MULTIPLE_TABLES): just local then main. */
static inline int fib_lookup(struct net *net, const struct flowi *flp,
                    struct fib_result *res)
{
     struct fib_table *table;

     table = fib_get_table(net, RT_TABLE_LOCAL); // try the local table first
     if (!table->tb_lookup(table, flp, res))     // 0 means a route was found
          return 0;

     table = fib_get_table(net, RT_TABLE_MAIN);  // then the main table
     if (!table->tb_lookup(table, flp, res))
          return 0;
     return -ENETUNREACH;                        // no route in either table
}

上の関数で使われているfib_get_tableを見てみましょう:easy
/* Single-table variant: only LOCAL and MAIN exist, at fixed hash indices. */
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{
         struct hlist_head *ptr;
 
         ptr = id == RT_TABLE_LOCAL ?
                 &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :    // local table bucket
                 &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];      // main table bucket
         return hlist_entry(ptr->first, struct fib_table, tb_hlist);
}

ここまで来れば、以降の処理は前述のマルチルーティングテーブルの場合と同じです。
ここまでで、ルーティング検索が成功して結果(res)が返ってきたか、失敗したかが分かりました。では最初の関数の流れに戻りましょう!
次に見るべきはip_route_input_slow内のip_mkroute_input関数です!!(ルーティングキャッシュの作成)
------>つまり、ルーティングキャッシュに新しいルーティングエントリを挿入する処理です。
static inline int ip_mkroute_input(struct sk_buff *skb,
                                    struct fib_result* res,
                                    const struct flowi *fl,
                                    struct in_device *in_dev,
                                    __be32 daddr, __be32 saddr, u32 tos)
{
         struct rtable* rth = NULL;
         int err;
         unsigned hash;
 
#ifdef CONFIG_IP_ROUTE_MULTIPATH
         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
                 fib_select_multipath(fl, res);  // several next hops: pick one
#endif
 
         /* create a routing cache entry */
         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);  // build the entry into rth (step 1)
         if (err)
                 return err;
 
         /* put it into the cache */
         hash = rt_hash(daddr, saddr, fl->iif);   // bucket for (dst, src, iif)
         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);  // insert into the cache and attach to the skb (step 2)
}

__mkroute_input を見てみましょう:
/*
 * __mkroute_input - allocate and initialise a route cache entry (rtable)
 * for a forwarded packet, based on the fib_lookup() result in @res.
 *
 * On success *@result points at the new entry (refcount 1) with
 * dst.input = ip_forward and dst.output = ip_output, and 0 is returned.
 * On failure a negative errno is returned and no entry is created.
 *
 * NOTE(review): the tail of this function was collapsed onto one line by
 * the article's extraction; the layout below is reconstructed from the
 * kernel source of this era — verify against net/ipv4/route.c.
 */
static inline int __mkroute_input(struct sk_buff *skb,
                                   struct fib_result* res,
                                   struct in_device *in_dev,
                                   __be32 daddr, __be32 saddr, u32 tos,
                                   struct rtable **result)
{
 
         struct rtable *rth;
         int err;
         struct in_device *out_dev;
         unsigned flags = 0;
         __be32 spec_dst;
         u32 itag;
 
         /* get a working reference to the output device */
         out_dev = in_dev_get(FIB_RES_DEV(*res));
         if (out_dev == NULL) {
                 if (net_ratelimit())
                         printk(KERN_CRIT "Bug in ip_route_input" \
                                "_slow(). Please, report\n");
                 return -EINVAL;
         }
 
         /* Reverse-path check the source address against the FIB. */
         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
                                   in_dev->dev, &spec_dst, &itag);
         if (err < 0) {
                 /* Bogus ("martian") source address: log and reject. */
                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                          saddr);
                 err = -EINVAL;
                 goto cleanup;
         }
 
         if (err)
                 flags |= RTCF_DIRECTSRC;
 
         /* Packet would leave through the interface it arrived on:
          * candidate for an ICMP redirect back to the sender. */
         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
             (IN_DEV_SHARED_MEDIA(out_dev) ||
              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                 flags |= RTCF_DOREDIRECT;
 
         if (skb->protocol != htons(ETH_P_IP)) {
                 /* Not IP (i.e. ARP). Do not create route, if it is
                  * invalid for proxy arp. DNAT routes are always valid.
                  */
                 if (out_dev == in_dev) {
                         err = -EINVAL;
                         goto cleanup;
                 }
         }
 
         /* Allocate the cache entry (dst_entry embedded in rtable). */
         rth = dst_alloc(&ipv4_dst_ops);
         if (!rth) {
                 err = -ENOBUFS;
                 goto cleanup;
         }
 
         atomic_set(&rth->u.dst.__refcnt, 1);    /* caller holds the first reference */
         rth->u.dst.flags= DST_HOST;
         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                 rth->u.dst.flags |= DST_NOPOLICY;
         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
                 rth->u.dst.flags |= DST_NOXFRM;
         /* Fill in the flow key and the cached route fields. */
         rth->fl.fl4_dst = daddr;
         rth->rt_dst     = daddr;
         rth->fl.fl4_tos = tos;
         rth->fl.mark    = skb->mark;
         rth->fl.fl4_src = saddr;
         rth->rt_src     = saddr;
         rth->rt_gateway = daddr;
         rth->rt_iif     = rth->fl.iif = in_dev->dev->ifindex;
         rth->u.dst.dev  = (out_dev)->dev;
         dev_hold(rth->u.dst.dev);
         rth->idev       = in_dev_get(rth->u.dst.dev);
         rth->fl.oif     = 0;
         rth->rt_spec_dst= spec_dst;
 
         rth->u.dst.input = ip_forward;    /* forwarded packets re-enter here */
         rth->u.dst.output = ip_output;    /* and leave through ip_output()  */
         rth->rt_genid = atomic_read(&rt_genid);
 
         rt_set_nexthop(rth, res, itag);
 
         rth->rt_flags = flags;
 
         *result = rth;                    /* hand the new entry to the caller */
         err = 0;
 cleanup:
         /* release the working reference to the output device */
         in_dev_put(out_dev);
         return err;
}

キャッシュをhashテーブルに するには、まずdst_を び します.allocはルートバッファを り て、 にSLABにキャッシュノードを り て、 り てるたびにゴミ を みます.
/*
 * dst_alloc - allocate a dst_entry (route cache node) from @ops's SLAB
 * cache, running the protocol's garbage collector first when the number
 * of live entries exceeds the gc threshold.
 *
 * Returns a zeroed, initialised entry (refcount 0, input/output set to
 * dst_discard) or NULL on failure.
 */
void * dst_alloc(struct dst_ops * ops)
{
        struct dst_entry * dst;

        /* Over the threshold: try to reclaim entries before allocating. */
        if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
                if (ops->gc(ops))
                        return NULL;
        }
        dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC); /* zeroed SLAB object */
        if (!dst)
                return NULL;
        /* Basic initialisation of the fresh entry. */
        atomic_set(&dst->__refcnt, 0);
        dst->ops = ops;
        dst->lastuse = jiffies;
        dst->path = dst;
        /* Until real handlers are installed, drop anything routed here. */
        dst->input = dst->output = dst_discard;
#if RT_CACHE_DEBUG >= 2
        atomic_inc(&dst_total);
#endif
        atomic_inc(&ops->entries);
        return dst;
}

そして、この新しいルートをどのように hash テーブルへ挿入するかを見てみましょう。rt_intern_hash です:
/*
 * rt_intern_hash - insert route cache entry @rt into bucket @hash of the
 * global route cache hash table.
 *
 * If an equivalent entry already exists (e.g. inserted concurrently by
 * another CPU) it is promoted to the head of the chain, @rt is dropped
 * and the existing entry is returned through @rp.  Otherwise @rt is
 * linked at the head.  While walking the chain, entries with a stale
 * generation id are reaped, and the lowest-scoring unreferenced entry
 * is remembered as an eviction candidate for over-long chains.
 *
 * Returns 0 on success with *rp set to the cached entry, or a negative
 * errno (e.g. -ENOBUFS when the neighbour table cannot be grown).
 *
 * NOTE(review): the final lines were collapsed by the article's
 * extraction; layout reconstructed from net/ipv4/route.c of this era.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
         struct rtable   *rth, **rthp;
         unsigned long   now;
         struct rtable *cand, **candp;
         u32             min_score;
         int             chain_length;
         int attempts = !in_softirq();   /* allow one GC retry only outside softirq context */
 
restart:
         chain_length = 0;
         min_score = ~(u32)0;
         cand = NULL;
         candp = NULL;
         now = jiffies;
 
         rthp = &rt_hash_table[hash].chain;  /* head of this bucket's chain */
         /* Lookups are lock-free (RCU); writers serialise on a per-bucket
          * spinlock so concurrent inserts cannot corrupt the chain. */
         spin_lock_bh(rt_hash_lock_addr(hash));
         while ((rth = *rthp) != NULL) {
			 if (rth->rt_genid != atomic_read(&rt_genid)) {  /* stale generation */
                         *rthp = rth->u.dst.rt_next; /* unlink */
                         rt_free(rth);    /* and free the expired entry */
                         continue;
                 }
			 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { /* an equivalent entry already exists */
                         /* Put it first */ 
				         /* Promote the existing entry to the chain head so
				          * subsequent lookups find it quickly; the freshly
				          * built @rt is redundant and will be dropped. */
                         *rthp = rth->u.dst.rt_next;  
                         /*
                          * Since lookup is lockfree, the deletion
                          * must be visible to another weakly ordered CPU before
                          * the insertion at the start of the hash chain.
                          */
                         rcu_assign_pointer(rth->u.dst.rt_next,
                                            rt_hash_table[hash].chain);
                         /*
                          * Since lookup is lockfree, the update writes
                          * must be ordered for consistency on SMP.
                          */
                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
                         dst_use(&rth->u.dst, now);  /* bump refcount and usage stamp */
                         spin_unlock_bh(rt_hash_lock_addr(hash));
 
                         rt_drop(rt);  /* discard the redundant new entry */
                         *rp = rth;
                         return 0;
                 }
                 /* No match: if the entry is currently unreferenced, score
                  * it and remember the lowest-scoring one as a candidate
                  * for eviction should the chain grow too long. */
                 if (!atomic_read(&rth->u.dst.__refcnt)) {
                         u32 score = rt_score(rth);
 
                         if (score <= min_score) {
                                 cand = rth;
                                 candp = rthp;
                                 min_score = score;
                         }
                 }
 
                 chain_length++; 
 
                 rthp = &rth->u.dst.rt_next;  /* advance along the chain */
         }

         if (cand) {
                 /* ip_rt_gc_elasticity used to be average length of chain
                  * length, when exceeded gc becomes really aggressive.
                  *
                  * The second limit is less certain. At the moment it allows
                  * only 2 entries per bucket. We will see.
                  */
			 if (chain_length > ip_rt_gc_elasticity) { /* chain too long: evict the candidate */
                         *candp = cand->u.dst.rt_next;   
                         rt_free(cand);
                 }
         }
 
         /* Try to bind route to arp only if it is output
            route or unicast forwarding path.
          */
         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                 int err = arp_bind_neighbour(&rt->u.dst); /* attach the ARP neighbour for the next hop */
                 if (err) {
                         spin_unlock_bh(rt_hash_lock_addr(hash));
 
                         if (err != -ENOBUFS) {
                                 rt_drop(rt);
                                 return err;
                         }
 
                         /* Neighbour tables are full and nothing
                            can be released. Try to shrink route cache,
                            it is most likely it holds some neighbour records.
                          */
			/* Temporarily make the garbage collector maximally
			 * aggressive, run it once, then retry the insert. */
                         if (attempts-- > 0) {
                                 int saved_elasticity = ip_rt_gc_elasticity;
                                 int saved_int = ip_rt_gc_min_interval;
                                 ip_rt_gc_elasticity     = 1;
                                 ip_rt_gc_min_interval   = 0;
                                 rt_garbage_collect(&ipv4_dst_ops);
                                 ip_rt_gc_min_interval   = saved_int;
                                 ip_rt_gc_elasticity     = saved_elasticity;
                                 goto restart;
                         }
 
                         if (net_ratelimit()) /* still no room: give up on this entry */
                                 printk(KERN_WARNING "Neighbour table overflow.\n");
                         rt_drop(rt);
                         return -ENOBUFS;
                 }
         }
 
         /* Link the new entry at the head of the bucket chain. */
         rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
         if (rt->u.dst.rt_next) {
                 struct rtable *trt;
                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                        NIPQUAD(rt->rt_dst));
                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                 printk("\n");
         }
#endif
         rt_hash_table[hash].chain = rt;
         spin_unlock_bh(rt_hash_lock_addr(hash));
         *rp = rt;
         return 0;
}

OK、 バッファの で しいルートを してOKを して、 はまた のどこに るべきですか???
つまりip_route_input_slowでip_mkroute_Inputが すると、 ip_route_input_slow が しましたが、 にどこが えましたか?ip_route_Input 、
では ip_rcv_finish に戻ります。ここまでで、ルートが見つかったか、見つからなかったかが決まりました。続いて ip_rcv_finish の最後の処理である dst_input を見てみます。
/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
         int err;
 
         for (;;) {
                 err = skb->dst->input(skb);     //     ,     input  ,           input      ~
                                                 //           :ip_forward ip_local_deliver
                 if (likely(err == 0))           //   :rth->u.dst.input= ip_local_deliver;   rth->u.dst.input = ip_forward
                         return err; 
                 /* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
                 if (unlikely(err != NET_XMIT_BYPASS))
                         return err;
         }
}

:まずローカル データを る:ip_forward
/*
 * ip_forward - forward a packet that is addressed to another host.
 *
 * Checks IPsec forwarding policy, the Router Alert option, the L2
 * packet type and the TTL; enforces strict source routing and path MTU
 * (emitting ICMP errors on failure); decrements the TTL; optionally
 * sends an ICMP redirect; and finally runs the FORWARD netfilter hook
 * with ip_forward_finish() as the continuation.
 */
int ip_forward(struct sk_buff *skb)
{
         struct iphdr *iph;      /* Our header */
         struct rtable *rt;      /* Route we use */
         struct ip_options * opt = &(IPCB(skb)->opt);
 
         if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))   /* IPsec forwarding policy check */
                 goto drop;
         /* Forwarding keeps the original IP addresses; the packet is
          * passed on according to the routing decision. */
         if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) /* Router Alert option present: let interested
                                                                    * raw sockets consume the packet instead */
                 return NET_RX_SUCCESS;
 
         if (skb->pkt_type != PACKET_HOST)   /* L2 destination was not this host: never forward */
                 goto drop;                  /* pkt_type is set from the link-layer address by L2 */
 
         skb_forward_csum(skb);  /* sets skb->ip_summed = CHECKSUM_NONE; */
                                 /* we modify the header below, so any checksum
                                  * offload state is no longer trustworthy */
         /*
          *      According to the RFC, we must first decrease the TTL field. If
          *      that reaches zero, we must reply an ICMP control message telling
          *      that the packet's lifetime expired.
          */
         if (ip_hdr(skb)->ttl <= 1)    /* TTL exhausted: ICMP time exceeded */
                 goto too_many_hops;
 
         if (!xfrm4_route_forward(skb)) /* IPsec (Internet Protocol Security) routing check */
                 goto drop;
 
         rt = (struct rtable*)skb->dst; /* the route cache entry chosen for this skb */
 
         if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) /* strict source route: the next hop must be
                                                                   * exactly the gateway named in the option */
                goto sr_failed;
 
         if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
                      (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { /* too big and DF set: cannot fragment */
                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,  /* ICMP "fragmentation needed" */
                           htonl(dst_mtu(&rt->u.dst)));
                 goto drop;
         }
 
         /* We are about to mangle packet. Copy it! */
         if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) /* unshare/expand the skb before editing it */
                 goto drop;
         iph = ip_hdr(skb);   /* (re)load the IP header pointer after the cow */
 
         /* Decrease ttl after skb cow done */
         ip_decrease_ttl(iph);   /* decrements TTL and fixes the IP checksum incrementally */
 
         /*
          *      We now generate an ICMP HOST REDIRECT giving the route
          *      we calculated.
          */
         if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp) /* packet leaves on the arrival interface and no
                                                                     * source routing: tell the sender about the
                                                                     * better next hop via ICMP REDIRECT */
                 ip_rt_send_redirect(skb);   /* send the redirect */
 
         skb->priority = rt_tos2priority(iph->tos);  /* map the TOS field to a queueing (QoS) priority */
 
         return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
                        ip_forward_finish);        /* on hook accept, continue in ip_forward_finish() */
 
 sr_failed:
         /*
          *      Strict routing permits no gatewaying
          */
          icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);   /* report the source-route failure */
          goto drop;
 
 too_many_hops:
         /* Tell the sender its packet died... */    /* TTL <= 1: ICMP time exceeded */
         IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
         icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
 drop:
         kfree_skb(skb);          /* free the dropped packet */
         return NET_RX_DROP;
 }
 

ip_forward_finish を見てみましょう:
/*
 * ip_forward_finish - last step of forwarding, run after the FORWARD
 * netfilter hook has accepted the packet.
 *
 * Bumps the forwarded-datagram counter, processes any IP options that
 * need per-hop handling, then hands the skb to dst_output(), which
 * invokes the output routine cached in the route (ip_output or
 * ip_mc_output).
 */
static int ip_forward_finish(struct sk_buff *skb)
{
         struct ip_options *opt = &IPCB(skb)->opt;

         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

         /* Options present? Let ip_forward_options() update them. */
         if (unlikely(opt->optlen != 0))
                 ip_forward_options(skb);

         return dst_output(skb);
}

まず、ユニキャストの を てみましょう.
ip_output :
/*
 * ip_output - output routine installed in unicast route cache entries.
 *
 * Stamps the outgoing device and the IP ethertype on the skb, then runs
 * the POST_ROUTING netfilter hook with ip_finish_output() as the
 * continuation.  The hook is bypassed for skbs flagged IPSKB_REROUTED.
 */
int ip_output(struct sk_buff *skb)
{
         struct net_device *out = skb->dst->dev;
 
         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 
         skb->protocol = htons(ETH_P_IP);
         skb->dev = out;
 
         return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, out,
                             ip_finish_output,
                             !(IPCB(skb)->flags & IPSKB_REROUTED));
}

ip_を てfinish_putput :
/*
 * ip_finish_output - called after the POST_ROUTING hook accepts a
 * packet.  Fragments the packet if it exceeds the destination MTU and
 * is not GSO; otherwise hands it straight to ip_finish_output2() for
 * neighbour/link-layer transmission.
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
         /* Policy lookup after SNAT yielded a new policy */
         if (skb->dst->xfrm != NULL) {
                 IPCB(skb)->flags |= IPSKB_REROUTED;
                 return dst_output(skb);
         }
#endif
         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) /* larger than the route MTU and not GSO:
                                                                  * must fragment (GSO segments later) */
                 return ip_fragment(skb, ip_finish_output2);     /* each fragment is sent via ip_finish_output2() */
         else
                 return ip_finish_output2(skb);   /* fits: send directly */
}

分割(フラグメント)処理を見てみましょう: ip_fragment
/*
 *      This IP datagram is too large to be sent in one piece.  Break it up into
 *      smaller pieces (each of size equal to IP header plus
 *      a block of the data of the original IP data part) that will yet fit in a
 *      single device frame, and queue such a frame for sending.
 */
 
/*
 * ip_fragment - split @skb into MTU-sized fragments, passing each one to
 * @output (here ip_finish_output2).  When the skb already carries a
 * well-formed frag_list the fragments are reused in place (fast path);
 * otherwise each fragment is copied into a freshly allocated skb (slow
 * path).  A packet with DF set is rejected with ICMP FRAG_NEEDED.
 *
 * NOTE(review): the slow-path loop was collapsed onto one line by the
 * article's extraction; the layout below is reconstructed from
 * net/ipv4/ip_output.c of this era — verify against the kernel source.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
         struct iphdr *iph;
         int raw = 0;
         int ptr;
         struct net_device *dev;
         struct sk_buff *skb2;
         unsigned int mtu, hlen, left, len, ll_rs, pad;
         int offset;
         __be16 not_last_frag;
         struct rtable *rt = (struct rtable*)skb->dst;
         int err = 0;
 
         dev = rt->u.dst.dev;
 
         /*
          *      Point into the IP datagram header.
          */
 
         iph = ip_hdr(skb);  /* the IP header */
         /* DF set and local fragmentation not permitted: report and drop. */
         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 
                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,  /* "fragmentation needed but DF set" */
                           htonl(ip_skb_dst_mtu(skb)));
                 kfree_skb(skb);
                 return -EMSGSIZE;
         }
 
         /*
          *      Setup starting values.
          */
 
         hlen = iph->ihl * 4;  /* header length: ihl counts 32-bit words, i.e. 4-byte units */
         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */  /* MTU minus the IP header */
         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
         /* When frag_list is given, use it. First, check its validity:
          * some transformers could create wrong frag_list or break existing
          * one, it is not prohibited. In this case fall back to copying.
          *
          * LATER: this step can be merged to real generation of fragments,
          * we can switch to copy when see the first bad fragment.
          */
         if (skb_shinfo(skb)->frag_list) {   /* skb_shinfo(SKB) is (struct skb_shared_info *)((SKB)->end) */
                 struct sk_buff *frag;       /* shared info lives right after the data area */
                 int first_len = skb_pagelen(skb);
                 int truesizes = 0;
                 /* The head must fit the MTU, be 8-byte aligned, not already
                  * be a fragment (no MF/offset bits) and not be cloned;
                  * otherwise fall back to the copying slow path. */
                 if (first_len - hlen > mtu ||
                     ((first_len - hlen) & 7) ||
                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                     skb_cloned(skb))
                         goto slow_path;     /* validity check failed */
                 /* Validate every chained fragment the same way. */
                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                         /* Correct geometry. */
                         if (frag->len > mtu ||
                             ((frag->len & 7) && frag->next) ||
                             skb_headroom(frag) < hlen)
                             goto slow_path; /* geometry unsuitable for reuse */
 
                         /* Partially cloned skb? */  /* shared data cannot be edited in place */
                         if (skb_shared(frag))
                                 goto slow_path;    /* fall back to copying */
 
                         BUG_ON(frag->sk);
                         if (skb->sk) {
                                 sock_hold(skb->sk);  /* transfer socket accounting to the fragment */
                                 frag->sk = skb->sk;
                                 frag->destructor = sock_wfree;
                                 truesizes += frag->truesize;
                         }
                 }
 
                 /* Everything is OK. Generate! */
 
                 err = 0;
                 offset = 0;
                 frag = skb_shinfo(skb)->frag_list;  /* detach the fragment chain */
                 skb_shinfo(skb)->frag_list = NULL;
                 skb->data_len = first_len - skb_headlen(skb); /* trim the head to the first fragment */
                 skb->truesize -= truesizes;
                 skb->len = first_len;
                 iph->tot_len = htons(first_len);
                 iph->frag_off = htons(IP_MF);    /* first fragment: More Fragments set */
                 ip_send_check(iph);   /* recompute the header checksum */
                      
                 for (;;) {    /* emit the head, then each chained fragment */
                         /* Prepare header of the next frame,
                          * before previous one went down. */
                         if (frag) {
                                 frag->ip_summed = CHECKSUM_NONE;   /* checksums are finalised right here */
                                 skb_reset_transport_header(frag);  /* mark the transport header position */
                                 __skb_push(frag, hlen);            /* open up hlen bytes in front for the IP header */
                                 skb_reset_network_header(frag);    /* mark the network header position */
                                 memcpy(skb_network_header(frag), iph, hlen); /* copy the (updated) IP header in */
                                 iph = ip_hdr(frag);                /* switch to this fragment's header */
                                 iph->tot_len = htons(frag->len);   /* per-fragment total length */
                                 ip_copy_metadata(frag, skb);       /* inherit metadata from the head skb */
                                 if (offset == 0)
                                         ip_options_fragment(frag); /* only the first fragment keeps per-datagram options */
                                 offset += skb->len - hlen;         /* running payload offset */
                                 iph->frag_off = htons(offset>>3);  /* the offset field counts 8-byte units */
                                 if (frag->next != NULL)            /* not the last piece: keep MF set */
                                         iph->frag_off |= htons(IP_MF);
                                 /* Ready, complete checksum */
                                 ip_send_check(iph);
                         }
  
                         err = output(skb);    /* transmit this fragment via ip_finish_output2() */
 
                         if (!err)
                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
                         if (err || !frag)
                                 break;
 
                         skb = frag;        /* advance: the next fragment becomes the current skb */
                         frag = skb->next;
                         skb->next = NULL;
                 }  /* end of the emit loop */
 
                 if (err == 0) {
                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                         return 0;
                 }
 
                 while (frag) {  /* error: free the remaining fragments */
                         skb = frag->next;
                         kfree_skb(frag);
                         frag = skb;
                 }
                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                 return err;   /* failed */
         }
 
 slow_path:
         left = skb->len - hlen;         /* Space per frame */ /* payload bytes still to send */
         ptr = raw + hlen;               /* Where to start from */ /* read cursor into the original skb */
 
         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
          * we need to make room for the encapsulating header
          */
         pad = nf_bridge_pad(skb);  /* extra headroom for bridge/VLAN/PPPoE encapsulation */
         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
         mtu -= pad;
 
         /*
          *      Fragment the datagram.
          */
 
         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; /* existing 13-bit offset, 8-byte units -> bytes */
         not_last_frag = iph->frag_off & htons(IP_MF);     /* are we re-fragmenting a non-final fragment? */
 
         /*
          *      Keep copying data until we run out.
          */
 
         while (left > 0) {   /* one freshly allocated skb per fragment */
                 len = left;
                 /* IF: it doesn't fit, use 'mtu' - the data space left */
                 if (len > mtu)    /* cap each fragment's payload at the MTU */
                         len = mtu;
                 /* IF: we are not sending upto and including the packet end
                    then align the next start on an eight byte boundary */
                 if (len < left) {
                         len &= ~7; /* every fragment except the last carries a multiple of 8 bytes */
                 }
                 /*
                  *      Allocate buffer.
                  */
 
                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {   /* new skb for this fragment */
                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                         err = -ENOMEM;
                         goto fail;
                 }
 
                 /*
                  *      Set up data on packet
                  */
 
                 ip_copy_metadata(skb2, skb);       /* copy control data from the original skb */
                 skb_reserve(skb2, ll_rs);          /* headroom for the link-layer header */
                 skb_put(skb2, len + hlen);         /* reserve IP header + payload space */
                 skb_reset_network_header(skb2);    /* mark the network header position */
                 skb2->transport_header = skb2->network_header + hlen; /* transport header follows the IP header */
 
                 /*
                  *      Charge the memory for the fragment to any owner
                  *      it might possess
                  */
 
                 if (skb->sk)                       /* account the fragment to the owning socket */
                         skb_set_owner_w(skb2, skb->sk);
 
                 /*
                  *      Copy the packet header into the new buffer.
                  */
 
                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* copy the IP header */
 
                 /*
                  *      Copy a block of the IP datagram.
                  */
                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) /* copy this fragment's payload */
                         BUG();
                 left -= len;                       /* that much payload now consumed */
 
                 /*
                  *      Fill in the new header fields.
                  */
                 iph = ip_hdr(skb2);                /* this fragment's header */
                 iph->frag_off = htons((offset >> 3)); /* offset field counts 8-byte units */
 
                 /* ANK: dirty, but effective trick. Upgrade options only if
                  * the segment to be fragmented was THE FIRST (otherwise,
                  * options are already fixed) and make it ONCE
                  * on the initial skb, so that all the following fragments
                  * will inherit fixed options.
                  */
                 if (offset == 0)                   /* fix options once, on the original skb */
                         ip_options_fragment(skb);
 
                 /*
                  *      Added AC : If we are fragmenting a fragment that's not the
                  *      last fragment then keep MF on each bit
                  */
                 if (left > 0 || not_last_frag)
                         iph->frag_off |= htons(IP_MF);
                 ptr += len;
                 offset += len;
 
                 /*
                  *      Put this fragment into the sending queue.
                  */
                 iph->tot_len = htons(len + hlen);
 
                 ip_send_check(iph);                /* finalise the header checksum */
 
                 err = output(skb2);                /* transmit via ip_finish_output2() */
                 if (err)
                         goto fail;
 
                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
         }
         kfree_skb(skb);
         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
         return err;
 
 fail:
         kfree_skb(skb);
         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
         return err;
 }

skb_shared_info を見てみましょう:
/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
struct skb_shared_info {
         atomic_t        dataref;   /* reference count on the shared data area */
         unsigned short  nr_frags;  /* number of page fragments used in frags[] */
         unsigned short  gso_size;  /* size of each GSO segment */
         /* Warning: this field is not always filled in (UFO)! */ 
         unsigned short  gso_segs; 
         unsigned short  gso_type;
         __be32          ip6_frag_id; /* IPv6 fragment identification */
         struct sk_buff  *frag_list;/* chain of fragment skbs (reused by ip_fragment's fast path) */
         skb_frag_t      frags[MAX_SKB_FRAGS];
};

OK、やっとフラグメント処理が終わりました。次に、どうやって送信するかを見てみましょう。output 関数の実体は ip_finish_output2 です:
/*
 * ip_finish_output2 - last IP-level step on the output path.
 *
 * Ensures there is enough headroom for the link-layer header, then
 * hands the skb to the neighbour subsystem: via the cached hardware
 * header (dst->hh) when available, otherwise through the neighbour's
 * output routine, which resolves the L2 address (ARP) as needed.
 *
 * NOTE(review): the final lines were collapsed by the article's
 * extraction; layout reconstructed from net/ipv4/ip_output.c.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
         struct dst_entry *dst = skb->dst;
         struct rtable *rt = (struct rtable *)dst;
         struct net_device *dev = dst->dev;
         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 
         if (rt->rt_type == RTN_MULTICAST)   /* multicast statistics */
                 IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
         else if (rt->rt_type == RTN_BROADCAST) /* broadcast statistics */
                 IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
 
         /* Be paranoid, rather than too clever. */
         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                 struct sk_buff *skb2;
 
                 /* Not enough headroom for the L2 header: reallocate. */
                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                 if (skb2 == NULL) {
                         kfree_skb(skb);
                         return -ENOMEM;
                 }
                 if (skb->sk)
                         skb_set_owner_w(skb2, skb->sk);
                 kfree_skb(skb);
                 skb = skb2;
         }
 
         if (dst->hh)
                 return neigh_hh_output(dst->hh, skb);    /* cached hardware header available: transmit now */
         else if (dst->neighbour)
                 return dst->neighbour->output(skb);  /* hand to the neighbour (ARP) layer for resolution */
 
         /* No neighbour information at all: cannot build the L2 header. */
         if (net_ratelimit())
                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
         kfree_skb(skb);
         return -EINVAL;
}

これで、ワンマン は !
マルチキャストの : を て:ip_mc_output
/*
 * ip_mc_output - output routine installed in multicast/broadcast route
 * cache entries.  Loops a clone of the skb back through the loopback
 * device for local listeners where required, then sends the original
 * through the POST_ROUTING hook to ip_finish_output().
 */
int ip_mc_output(struct sk_buff *skb)
{
         struct sock *sk = skb->sk;
         struct rtable *rt = (struct rtable*)skb->dst;
         struct net_device *dev = rt->u.dst.dev;
 
         /*
          *      If the indicated interface is up and running, send the packet.
          */
         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 
         skb->dev = dev;
         skb->protocol = htons(ETH_P_IP);
 
         /*
          *      Multicasts are looped back for other local users
          */
 
         if (rt->rt_flags&RTCF_MULTICAST) {
                 if ((!sk || inet_sk(sk)->mc_loop)
 #ifdef CONFIG_IP_MROUTE
                 /* Small optimization: do not loopback not local frames,
                    which returned after forwarding; they will be  dropped
                    by ip_mr_input in any case.
                    Note, that local frames are looped back to be delivered
                    to local recipients.
 
                    This check is duplicated in ip_mr_input at the moment.
                  */
                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 #endif
                 ) {
                         /* Clone the packet and loop it back so local
                          * multicast listeners also receive it. */
                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                         if (newskb)
                                 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
                                         NULL, newskb->dev,
                                         ip_dev_loopback_xmit);
                 }
 
                 /* Multicasts with ttl 0 must not go beyond the host */
 
                 if (ip_hdr(skb)->ttl == 0) {
                         kfree_skb(skb);
                         return 0;
                 }
         }
 
         if (rt->rt_flags&RTCF_BROADCAST) {
                 /* Broadcasts are likewise looped back to local listeners. */
                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                 if (newskb)
                         NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
                                 newskb->dev, ip_dev_loopback_xmit);
         }
 
         return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,   /* on accept, continue in ip_finish_output() */
                             ip_finish_output,
                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 }

ローカルからデータを転送する話、すなわち ip_forward の話はここまでです!
次からは、パケットをローカルの上位層に渡す処理についてお話しします!!!すなわち ip_local_deliver です!
/*
 * ip_local_deliver - deliver an IP packet to this host.
 *
 * Fragments (MF set, or non-zero fragment offset) are first handed to
 * ip_defrag() for reassembly; once a complete datagram is available it
 * passes the LOCAL_IN netfilter hook and ip_local_deliver_finish()
 * dispatches it to the transport-layer protocol.
 */
int ip_local_deliver(struct sk_buff *skb)   /* after reassembly, ip_local_deliver_finish hands the
                                             * packet to the upper (transport) layer */
{
         /*
          *      Reassemble IP fragments.
          */
 
         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { /* a fragment: either MF is set, or the offset
                                                                  * is non-zero (the final fragment) */
                 if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))    /* queued for reassembly; datagram not complete yet */
                         return 0;
                 /* ip_defrag() returned 0: skb now holds the full datagram. */
         }
 
         return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
                        ip_local_deliver_finish);      /* hand the complete datagram to the transport layer */
}

まず ip_defrag を見てみましょう:
/* Process an incoming IP datagram fragment. */
/* Process an incoming IP datagram fragment. */
/*
 * ip_defrag - add @skb to the matching reassembly queue, creating the
 * queue if this is the first fragment seen.  When fragment memory
 * exceeds the high threshold, old queues are evicted first.  Returns
 * the result of ip_frag_queue(), or -ENOMEM when no queue could be
 * found or created.
 */
int ip_defrag(struct sk_buff *skb, u32 user)
{
         struct ipq *qp;     /* per-datagram reassembly queue */
         struct net *net;
 
         IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
 
         net = skb->dev ? skb->dev->nd_net : skb->dst->dev->nd_net;
         /* Start by cleaning up the memory. */
         if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) /* too much memory held by fragment queues:
                                                                               * ip_evictor() discards the oldest ones */
                 ip_evictor(net);
 
         /* Lookup (or create) queue header */
         if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {/* find the queue this fragment belongs to,
                                                               * or create a new one for it */
 
                 int ret;
 
                 spin_lock(&qp->q.lock);
 
                 ret = ip_frag_queue(qp, skb);   /* insert the fragment into the queue; when all pieces
                                                  * have arrived the full IP datagram is reassembled */
                 spin_unlock(&qp->q.lock); 
 
                 ipq_put(qp);
                 return ret;
         }
 
         IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
         kfree_skb(skb);
         return -ENOMEM;
}

ip_を てfind :
/*
 * ip_find - look up (or create) the reassembly queue for a datagram.
 * @net:  network namespace
 * @iph:  IP header of the incoming fragment
 * @user: reassembly context
 *
 * A datagram is identified by (id, saddr, daddr, protocol); these fields
 * are hashed (mixed with the random seed ip4_frags.rnd inside ipqhashfn())
 * to select the bucket searched by inet_frag_find().  Returns the
 * containing struct ipq, or NULL on allocation failure.
 */
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
         struct inet_frag_queue *q;
         struct ip4_create_arg arg;
         unsigned int hash;
 
         arg.iph = iph;
         arg.user = user;
         hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
         q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);  /* find or create */
         if (q == NULL)
                 goto out_nomem;
 
         return container_of(q, struct ipq, q);
 
 out_nomem:
         /* Fixed: the quoted source split this string literal across two
          * physical lines, which is not valid C; restored the original
          * kernel form with an escaped newline. */
         LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
         return NULL;
}

続いて inet_frag_find を見てみましょう:
/*
 * inet_frag_find - find the fragment queue matching @key in bucket @hash
 * of @f, restricted to namespace @nf.  On a hit a reference is taken and
 * the queue returned; on a miss a new queue is created via
 * inet_frag_create().
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
                 struct inet_frags *f, void *key, unsigned int hash)
{
	struct inet_frag_queue *q;
	struct hlist_node *n;

	read_lock(&f->lock);
	hlist_for_each_entry(q, n, &f->hash[hash], list) {
		/* Skip queues belonging to other namespaces or other datagrams. */
		if (q->net != nf || !f->match(q, key))
			continue;
		atomic_inc(&q->refcnt);	/* hand a reference to the caller */
		read_unlock(&f->lock);
		return q;
	}
	read_unlock(&f->lock);

	/* Nothing matched: allocate a fresh queue for this datagram. */
	return inet_frag_create(nf, f, key, hash);
}

OK、ip_defrag に戻りましょう。次は ip_frag_queue を見て、どのようにしてすべてのフラグメントを収集・結合するかを確認します。
この関数の細部には深入りしませんが、要点は、すべてのフラグメントが到着した時点で offset の順に並べて結合することです(skb のチェーンに注目してください)。
この qp キューが実はグローバル変数 ip4_frags に由来することに気づけば OK です。
/* Add new segment to existing queue. */
/*
 * ip_frag_queue - insert fragment @skb into reassembly queue @qp.
 *
 * Decodes frag_off into flags + byte offset, trims the fragment so it
 * does not overlap already-queued data, links it into the offset-sorted
 * fragment list, and — once both the first (offset 0) and last (MF
 * clear) fragments are present and the byte count (meat) equals the
 * expected total length — calls ip_frag_reasm() to rebuild the datagram.
 * Returns -EINPROGRESS while more fragments are awaited, a negative
 * error on a bad/duplicate fragment (the skb is freed), or the result
 * of ip_frag_reasm() on completion.
 */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
         struct sk_buff *prev, *next;
         struct net_device *dev;
         int flags, offset;
         int ihl, end;
         int err = -ENOENT;
 
         if (qp->q.last_in & COMPLETE)   /* queue already reassembled or dead */
                 goto err;
 
         /* Defend against wrapped-offset attacks: if this fragment is "too
          * far" past what we have, reinitialize the queue. */
         if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
             unlikely(ip_frag_too_far(qp)) &&
             unlikely(err = ip_frag_reinit(qp))) {
                 ipq_kill(qp);
                 goto err;
         }
 
         offset = ntohs(ip_hdr(skb)->frag_off);
         flags = offset & ~IP_OFFSET;    /* keep the MF/DF flag bits */
         offset &= IP_OFFSET;
         offset <<= 3;           /* offset is in 8-byte chunks */
         ihl = ip_hdrlen(skb);
 
         /* Determine the position of this fragment. */
         end = offset + skb->len - ihl;  /* byte just past this fragment's data */
         err = -EINVAL;
 
         /* Is this the final fragment? */
         if ((flags & IP_MF) == 0) {
                 /* If we already have some bits beyond end
                  * or have different end, the segment is corrupted.
                  */
                 if (end < qp->q.len ||
                     ((qp->q.last_in & LAST_IN) && end != qp->q.len))
                         goto err;
                 qp->q.last_in |= LAST_IN;
                 qp->q.len = end;        /* now we know the total datagram length */
         } else {
                 /* Non-final fragments must end on an 8-byte boundary. */
                 if (end&7) {
                         end &= ~7;
                         if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                                 skb->ip_summed = CHECKSUM_NONE;
                 }
                 if (end > qp->q.len) {
                         /* Some bits beyond end -> corruption. */
                         if (qp->q.last_in & LAST_IN)
                                 goto err;
                         qp->q.len = end;
                 }
         }
         if (end == offset)      /* zero-length fragment */
                 goto err;
 
         err = -ENOMEM;
         if (pskb_pull(skb, ihl) == NULL)       /* strip the IP header */
                 goto err;
 
         err = pskb_trim_rcsum(skb, end - offset);      /* trim to payload length */
         if (err)
                 goto err;
 
         /* Find out which fragments are in front and at the back of us
          * in the chain of fragments so far.  We must know where to put
          * this fragment, right?
          */
         prev = NULL;
         for (next = qp->q.fragments; next != NULL; next = next->next) {
                 if (FRAG_CB(next)->offset >= offset)
                         break;  /* bingo! */
                 prev = next;
         }
 
         /* We found where to put this one.  Check for overlap with
          * preceding fragment, and, if needed, align things so that
          * any overlaps are eliminated.
          */
         if (prev) {
                 int i = (FRAG_CB(prev)->offset + prev->len) - offset;
 
                 if (i > 0) {
                         /* The previous fragment covers our first i bytes:
                          * advance our offset and drop the duplicate head. */
                         offset += i;
                         err = -EINVAL;
                         if (end <= offset)     /* entirely covered already */
                                 goto err;
                         err = -ENOMEM;
                         if (!pskb_pull(skb, i))
                                 goto err;
                         if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                                 skb->ip_summed = CHECKSUM_NONE;
                 }
         }
 
         err = -ENOMEM;
 
         /* Trim or drop following fragments that we overlap. */
         while (next && FRAG_CB(next)->offset < end) {
                 int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
 
                 if (i < next->len) {
                         /* Eat head of the next overlapped fragment
                          * and leave the loop. The next ones cannot overlap.
                          */
                         if (!pskb_pull(next, i))
                                 goto err;
                         FRAG_CB(next)->offset += i;
                         qp->q.meat -= i;       /* account the trimmed bytes */
                         if (next->ip_summed != CHECKSUM_UNNECESSARY)
                                 next->ip_summed = CHECKSUM_NONE;
                         break;
                 } else {
                         struct sk_buff *free_it = next;
 
                         /* Old fragment is completely overridden with
                          * new one drop it.
                          */
                         next = next->next;
 
                         if (prev)
                                 prev->next = next;
                         else
                                 qp->q.fragments = next;
 
                         qp->q.meat -= free_it->len;
                         frag_kfree_skb(qp->q.net, free_it, NULL);
                 }
         }
 
         FRAG_CB(skb)->offset = offset;
 
         /* Insert this fragment in the chain of fragments. */
         skb->next = next;
         if (prev)
                 prev->next = skb;
         else
                 qp->q.fragments = skb;
 
         dev = skb->dev;
         if (dev) {
                 qp->iif = dev->ifindex;        /* remember the arrival interface */
                 skb->dev = NULL;
        }
         qp->q.stamp = skb->tstamp;
         qp->q.meat += skb->len;        /* meat = bytes received so far */
         atomic_add(skb->truesize, &qp->q.net->mem);
         if (offset == 0)
                 qp->q.last_in |= FIRST_IN;
 
         /* First and last fragments seen and no gaps left: reassemble. */
         if (qp->q.last_in == (FIRST_IN | LAST_IN) && qp->q.meat == qp->q.len)
                 return ip_frag_reasm(qp, prev, dev);
 
         /* Still incomplete: refresh LRU position and keep waiting. */
         write_lock(&ip4_frags.lock);
         list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
         write_unlock(&ip4_frags.lock);
         return -EINPROGRESS;
 
 err:
         kfree_skb(skb);
         return err;
}

OK、ip_defrag の処理が完了したら ip_local_deliver に戻ります。次に ip_local_deliver_finish を見てください:
/*
 * ip_local_deliver_finish - hand a complete IP datagram to its L4 protocol.
 *
 * Strips the IP header, gives raw sockets a copy via raw_local_deliver(),
 * then looks up the registered struct net_protocol in inet_protos[] (hashed
 * by protocol number) and calls its ->handler (e.g. tcp_v4_rcv for TCP).
 * A negative handler return means "retry as protocol -ret" (resubmit).
 * If no handler exists and no raw socket took the packet, an ICMP
 * protocol-unreachable is sent.  Always returns 0.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
         __skb_pull(skb, ip_hdrlen(skb));   /* strip the IP header (incl. options) */
 
         /* Point into the IP datagram, just past the header. */
         skb_reset_transport_header(skb);   /* transport header now at skb->data */
 
         rcu_read_lock();
         {
                 int protocol = ip_hdr(skb)->protocol;    /* L4 protocol number */
                 int hash, raw;
                 struct net_protocol *ipprot;
 
         resubmit:
                 raw = raw_local_deliver(skb, protocol);  /* clone to raw sockets, if any */
 
                 hash = protocol & (MAX_INET_PROTOS - 1); /* index into inet_protos[] */
                 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { /* registered handler found */
                         int ret;
 
                         if (!ipprot->no_policy) {   /* protocol wants IPsec policy checks */
                                 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                         kfree_skb(skb);
                                         goto out;
                                 }
                                 nf_reset(skb);
                         }
                         ret = ipprot->handler(skb);    /* enter L4, e.g. tcp_v4_rcv() */
                         if (ret < 0) {                 /* negative: resubmit as protocol -ret */
                                 protocol = -ret;
                                 goto resubmit;
                         }
                        IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
                 } else {     /* no handler registered for this protocol */
                         if (!raw) {   /* and no raw socket consumed it either */
                                 if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                         IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
                                         icmp_send(skb, ICMP_DEST_UNREACH,     /* tell the sender */
                                                   ICMP_PROT_UNREACH, 0);
                                 }
                         } else
                                 IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
                         kfree_skb(skb);
                 }
         }
out:
         rcu_read_unlock();
 
         return 0;
}

このプロトコル構造体 struct net_protocol を見てみましょう:
/* This is used to register protocols. */
/* One entry per L4 protocol, stored in the inet_protos[] table and
 * consulted by ip_local_deliver_finish(). */
 struct net_protocol {
         int                     (*handler)(struct sk_buff *skb);                /* L4 receive entry point */
         void                    (*err_handler)(struct sk_buff *skb, u32 info);  /* ICMP error callback */
         int                     (*gso_send_check)(struct sk_buff *skb);         /* GSO pre-segmentation check */
         struct sk_buff         *(*gso_segment)(struct sk_buff *skb,
                                                int features);                   /* GSO software segmentation */
         int                     no_policy;      /* nonzero: skip IPsec policy check on input */
 };

各レイヤ4プロトコルは net_protocol 構造体として定義され、プロトコル番号の hash 値によって inet_protos テーブルに登録されています:
#ifdef CONFIG_IP_MULTICAST
/* IGMP (multicast group management): datagrams go to igmp_rcv().
 * NOTE(review): .netns_ok is not a member of the struct net_protocol
 * quoted earlier in this article; this snippet is from a newer kernel
 * version -- verify against the source tree you are reading. */
static const struct net_protocol igmp_protocol = {
	.handler =	igmp_rcv,
	.netns_ok =	1,
};
#endif

/* TCP: ip_local_deliver_finish() dispatches TCP segments to tcp_v4_rcv();
 * ICMP errors for TCP flows reach tcp_v4_err().
 * NOTE(review): .gro_receive/.gro_complete/.netns_ok do not appear in the
 * struct net_protocol quoted earlier in this article; this snippet is from
 * a newer kernel version -- verify against your source tree. */
static const struct net_protocol tcp_protocol = {
	.handler =	tcp_v4_rcv,
	.err_handler =	tcp_v4_err,
	.gso_send_check = tcp_v4_gso_send_check,
	.gso_segment =	tcp_tso_segment,
	.gro_receive =	tcp4_gro_receive,
	.gro_complete =	tcp4_gro_complete,
	.no_policy =	1,
	.netns_ok =	1,
};

/* UDP: datagrams are delivered to udp_rcv(); ICMP errors to udp_err().
 * NOTE(review): .netns_ok belongs to a newer kernel than the struct
 * definition quoted earlier in this article. */
static const struct net_protocol udp_protocol = {
	.handler =	udp_rcv,
	.err_handler =	udp_err,
	.gso_send_check = udp4_ufo_send_check,
	.gso_segment = udp4_ufo_fragment,
	.no_policy =	1,
	.netns_ok =	1,
};

/* ICMP: messages are delivered to icmp_rcv(); no IPsec policy check.
 * NOTE(review): .netns_ok belongs to a newer kernel than the struct
 * definition quoted earlier in this article. */
static const struct net_protocol icmp_protocol = {
	.handler =	icmp_rcv,
	.no_policy =	1,
	.netns_ok =	1,
};

TCPプロトコルの場合、ハンドラは tcp_v4_rcv であり、ここから TCP 層の受信処理が始まります。
OK、これで IP 層の受信パスの説明は終わりです!