PySOT Training Process Walkthrough (Detailed Comments) - SiamRPN++ & ResNet-50


Preparation phase
# Distributed initialization: obtain this process's rank and the total number of processes.
rank, world_size = dist_init()
# world_size: total number of processes (one process per GPU)
# rank: the ID of this process
# Configuration merge: merge the yaml file passed via args (the config.yaml under the experiments directory) into cfg.
cfg.merge_from_file(args.cfg)
# merge_from_file(*.yaml): merge the settings in the yaml file into the CfgNode cfg
# Only the main process (process ID 0) creates log files.
if rank == 0:
    if not os.path.exists(cfg.TRAIN.LOG_DIR):
        os.makedirs(cfg.TRAIN.LOG_DIR)
    init_log('global', logging.INFO)
    if cfg.TRAIN.LOG_DIR:
        add_file_handler('global',
                         os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'),
                         logging.INFO)
    logger.info("Version Information: 
{}
"
.format(commit())) logger.info("config
{}"
.format(json.dumps(cfg, indent=4))) # : ModelBuilder , get_backbone/get_neck/get_rpn_head/get_mask_head( ) #ModelBuilder forward(data) , data template patch search patch, label:label_cls、 label:label_loc, :label_loc_weight。 outputs total_loss/ cls_loss/ loc_loss; #ModelBuilder template(z)(backbone neck ) track(x)(backbone neck , rpn_head ), cls/loc/mask( )。
model = ModelBuilder().cuda().train()
dist_model = DistModule(model)
# model.cuda().train(): move the model onto the GPU and set mode=training mode;
# DistModule(model): wrap the model for distributed training; a copy of the model is kept on each GPU;

Below is the implementation of each module assembled in ModelBuilder.

1. Building the backbone
self.backbone = get_backbone(cfg.BACKBONE.TYPE, **cfg.BACKBONE.KWARGS) #model_builder.py
BACKBONES = {
              'alexnetlegacy': alexnetlegacy,
              'mobilenetv2': mobilenetv2,
              'resnet18': resnet18,
              'resnet34': resnet34,
              'resnet50': resnet50,
              'alexnet': alexnet,
            } #__init__.py
def resnet50(**kwargs):
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) #resnet_atrous.py
    return model

Here, the ResNet-50 build consists of two main parts: the overall ResNet assembly and the Bottleneck module.
* Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1)
             input channels = inplanes; output channels = 4*planes; stride=1; dilation=1 (defaults)
             x => conv1(inplanes, planes) => bn1 => relu => conv2(planes, planes) => bn2 => relu => conv3(planes, planes*4) => bn3 => out + residual (residual is x itself, or downsample(x) when a downsample module is passed); the stride and dilation act in conv2. A condensed code sketch follows.
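A condensed sketch of this flow (abridged from pysot's resnet_atrous.py; conv2's padding bookkeeping is written to match the dd/padding values listed per layer below):

import torch.nn as nn

class Bottleneck(nn.Module):
    expansion = 4  # output channels = planes * 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
        super(Bottleneck, self).__init__()
        # 1x1 conv: reduce channels
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # 3x3 conv: carries the stride and dilation; padding is chosen so the
        # main path stays aligned with the downsample path
        padding = 2 - stride
        if downsample is not None and dilation > 1:
            dilation = dilation // 2   # dd = dilation // 2
            padding = dilation
        if dilation > 1:
            padding = dilation
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=padding, dilation=dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # 1x1 conv: expand channels to planes * 4
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        residual = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        return self.relu(out + residual)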
           
* ResNet(block, layers, used_layers)
                => conv1(3,64) => bn1 => x_:relu => maxpool(ksz=3,s=2,p=1) => p1:layer1 => p2:layer2 => p3:layer3 => p4:layer4 => out:[x_,p1,p2,p3,p4] => out[p2,p3,p4]
           layer1 = _make_layer(block, planes=64, blocks=layers[0], stride=1, dilation=1): block is Bottleneck, layers[0]=3
           downsample = nn.Sequential(nn.Conv2d(self.inplanes=64, planes * block.expansion=64*4, kernel_size=1, stride=1, bias=False), nn.BatchNorm2d(planes * block.expansion=64*4),)
           block(64,64,1,downsample,1) => Bottleneck(inplanes, planes, stride=1, downsample, dilation=1) => in: 64 channels, out: 256 channels
           self.inplanes = 64*4 = 256
           block(256,64,dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in: 256 channels, out: 256 channels
           block(256,64,dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in: 256 channels, out: 256 channels
           Result: nn.Sequential(Bottleneck*3)
           Summary: spatial size unchanged, channels 64 => 256
           
           layer2 = _make_layer(block, planes=128, blocks=layers[1], stride=2, dilation=1): block is Bottleneck, layers[1]=4
           dd = 1, padding = 0
           downsample = nn.Sequential(nn.Conv2d(self.inplanes=256, planes * block.expansion=128*4, kernel_size=3, stride=2, bias=False, padding=0, dilation=1), nn.BatchNorm2d(planes * block.expansion=128*4),)
           block(256,128,2,downsample,1) => Bottleneck(inplanes, planes, stride=2, downsample, dilation=1) => in: 256 channels, out: 512 channels
           self.inplanes = 128*4 = 512
           block(512,128,dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in: 512 channels, out: 512 channels
           block(512,128,dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in: 512 channels, out: 512 channels
           block(512,128,dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in: 512 channels, out: 512 channels
           Result: nn.Sequential(Bottleneck*4)
           Summary: spatial size halved (stride=2), channels 256 => 512
           
           self.feature_size = 128 * block.expansion = 128*4 = 512
           
           layer3 = _make_layer(block, planes=256, blocks=layers[2], stride=1, dilation=2): block is Bottleneck, layers[2]=6
           dd = dilation // 2 = 1, padding = dd = 1
           downsample = nn.Sequential(nn.Conv2d(self.inplanes=512, planes * block.expansion=256*4, kernel_size=3, stride=1, bias=False, padding=1, dilation=1), nn.BatchNorm2d(planes * block.expansion=256*4),)
           block(512,256,1,downsample,2) => Bottleneck(inplanes, planes, stride=1, downsample, dilation=2) => in: 512 channels, out: 1024 channels
           self.inplanes = 256*4 = 1024
           block(1024,256,dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in: 1024 channels, out: 1024 channels
           block(1024,256,dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in: 1024 channels, out: 1024 channels
           block(1024,256,dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in: 1024 channels, out: 1024 channels
           block(1024,256,dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in: 1024 channels, out: 1024 channels
           block(1024,256,dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in: 1024 channels, out: 1024 channels
           Result: nn.Sequential(Bottleneck*6)
           Summary: spatial size unchanged, channels 512 => 1024, dilated convolution with dilation=2
           
           self.feature_size = (256 + 128) * 4 = 1536
           
           layer4 = _make_layer(block, planes=512, blocks=layers[3], stride=1, dilation=4): block is Bottleneck, layers[3]=3
           dd = dilation // 2 = 2, padding = dd = 2
           downsample = nn.Sequential(nn.Conv2d(self.inplanes=1024, planes * block.expansion=512*4, kernel_size=3, stride=1, bias=False, padding=2, dilation=2), nn.BatchNorm2d(planes * block.expansion=512*4),)
           block(1024,512,1,downsample,4) => Bottleneck(inplanes, planes, stride=1, downsample, dilation=4) => in: 1024 channels, out: 2048 channels
           self.inplanes = 512*4 = 2048
           block(2048,512,dilation=4) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=4) => in: 2048 channels, out: 2048 channels
           block(2048,512,dilation=4) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=4) => in: 2048 channels, out: 2048 channels
           Result: nn.Sequential(Bottleneck*3)
           Summary: spatial size unchanged, channels 1024 => 2048, dilated convolution with dilation=4
           
           self.feature_size = 512 * 4 = 2048
      [Note] With dilation=1 this is an ordinary convolution whose kernel cells are adjacent; with dilation=2 a gap of one cell is left between adjacent kernel cells, which enlarges the receptive field without adding parameters.
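A quick standalone check (plain PyTorch, not pysot code): a 3x3 kernel with dilation=2 spans a 5x5 window, so padding=2 preserves the spatial size just as padding=1 does for an ordinary 3x3 convolution, while enlarging the receptive field.

import torch
import torch.nn as nn

x = torch.randn(1, 64, 15, 15)
conv_d1 = nn.Conv2d(64, 64, kernel_size=3, padding=1, dilation=1)
conv_d2 = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)
print(conv_d1(x).shape)  # torch.Size([1, 64, 15, 15])
print(conv_d2(x).shape)  # torch.Size([1, 64, 15, 15]), same size, larger receptive field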

2. Building the neck
if cfg.ADJUST.ADJUST:
    self.neck = get_neck(cfg.ADJUST.TYPE, **cfg.ADJUST.KWARGS) #model_builder.py

NECKS = {
         'AdjustLayer': AdjustLayer,
         'AdjustAllLayer': AdjustAllLayer
        } #__init__.py

ADJUST:
    ADJUST: true
    TYPE: "AdjustAllLayer"
    KWARGS:
        in_channels: [512, 1024, 2048]
        out_channels: [256, 256, 256] #config.yaml

The neck is of type AdjustAllLayer, but it too is built from the AdjustLayer class. As config.yaml shows, the input and output channels are lists of length 3, so the neck holds three AdjustLayer modules, registered via the self.add_module() method under the names downsample2/downsample3/downsample4. Looking at AdjustLayer(in_channels, out_channels, center_size=7): in_channels and out_channels receive the corresponding value pairs from config.yaml, namely (512,256)/(1024,256)/(2048,256), and center_size corresponds to the feature center-crop described in the SiamRPN++ paper to reduce computation; center_size=7 is the default defined in the code. The AdjustLayer implementation is a downsample layer:
self.downsample = nn.Sequential(
    nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
    nn.BatchNorm2d(out_channels),
    )
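For completeness, a minimal sketch of how AdjustAllLayer registers and applies the three AdjustLayer modules described above (condensed from pysot's neck.py; assumes torch.nn as nn and the AdjustLayer class):

class AdjustAllLayer(nn.Module):
    def __init__(self, in_channels, out_channels, center_size=7):
        super(AdjustAllLayer, self).__init__()
        self.num = len(out_channels)
        if self.num == 1:
            self.downsample = AdjustLayer(in_channels[0], out_channels[0], center_size)
        else:
            # named downsample2/downsample3/downsample4, matching layer2-4 of the backbone
            for i in range(self.num):
                self.add_module('downsample' + str(i + 2),
                                AdjustLayer(in_channels[i], out_channels[i], center_size))

    def forward(self, features):
        if self.num == 1:
            return self.downsample(features)
        out = []
        for i in range(self.num):
            adj_layer = getattr(self, 'downsample' + str(i + 2))
            out.append(adj_layer(features[i]))
        return out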

When forward is called during training, it decides whether the feature processed by the downsample layer should be center-cropped (only features smaller than 20 on a side are cropped):
def forward(self, x):
    x = self.downsample(x)
    if x.size(3) < 20:
        l = (x.size(3) - self.center_size) // 2
        r = l + self.center_size
        x = x[:, :, l:r, l:r]
    return x
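Note: with the default sizes in config.yaml (127x127 template, 255x255 search), the template features arrive here at 15x15 and are cropped to the central 7x7, while the 31x31 search features exceed the threshold of 20 and pass through unchanged.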

3. Building the rpn_head
self.rpn_head = get_rpn_head(cfg.RPN.TYPE, **cfg.RPN.KWARGS) #model_builder.py
RPNS = {
        'UPChannelRPN': UPChannelRPN,
        'DepthwiseRPN': DepthwiseRPN,
        'MultiRPN': MultiRPN
       } #__init__.py
RPN:
    TYPE: 'MultiRPN'
    KWARGS:
        anchor_num: 5
        in_channels: [256, 256, 256]
        weighted: true #config.yaml
MultiRPN(anchor_num, in_channels, weighted=False) #rpn.py

Here every RPN input has 256 channels. SiamRPN++ has three rpn_heads, named rpn2/rpn3/rpn4, one per neck output. After the depthwise correlation their outputs are fused by a weighted average (weights computed as F.softmax(cls_weight) / F.softmax(loc_weight)); a fusion sketch follows the code below. The DepthwiseRPN class runs the depthwise correlation separately for the classification and regression branches:
DepthwiseRPN(anchor_num, in_channels[i], in_channels[i])
self.cls = DepthwiseXCorr(in_channels, out_channels, 2 * anchor_num)
self.loc = DepthwiseXCorr(in_channels, out_channels, 4 * anchor_num)
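A condensed sketch of the MultiRPN fusion logic described above (abridged from pysot's rpn.py; assumes import torch, torch.nn as nn, torch.nn.functional as F, and the DepthwiseRPN class detailed next):

class MultiRPN(nn.Module):
    def __init__(self, anchor_num, in_channels, weighted=False):
        super(MultiRPN, self).__init__()
        self.weighted = weighted
        for i in range(len(in_channels)):
            # heads are named rpn2/rpn3/rpn4, one per neck output
            self.add_module('rpn' + str(i + 2),
                            DepthwiseRPN(anchor_num, in_channels[i], in_channels[i]))
        if self.weighted:
            # one learnable fusion weight per head, softmax-normalized in forward
            self.cls_weight = nn.Parameter(torch.ones(len(in_channels)))
            self.loc_weight = nn.Parameter(torch.ones(len(in_channels)))

    def forward(self, z_fs, x_fs):
        cls, loc = [], []
        for idx, (z_f, x_f) in enumerate(zip(z_fs, x_fs), start=2):
            rpn = getattr(self, 'rpn' + str(idx))
            c, l = rpn(z_f, x_f)
            cls.append(c)
            loc.append(l)
        if self.weighted:
            cls_weight = F.softmax(self.cls_weight, 0)
            loc_weight = F.softmax(self.loc_weight, 0)
            return (sum(c * w for c, w in zip(cls, cls_weight)),
                    sum(l * w for l, w in zip(loc, loc_weight)))
        return sum(cls) / len(cls), sum(loc) / len(loc)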

Here in_channels=256 and out_channels=256; this corresponds to DepthwiseXCorr.__init__(in_channels=256, hidden=256, out_channels=10 (or 20), kernel_size=3). For the template branch (in: 256, out: 256):
self.conv_kernel = nn.Sequential(
        nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
        nn.BatchNorm2d(hidden),
        nn.ReLU(inplace=True),
        ) 

For the search branch (in: 256, out: 256):
self.conv_search = nn.Sequential(
        nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
        nn.BatchNorm2d(hidden),
        nn.ReLU(inplace=True),
        )

depth-wise correlation:
import torch.nn.functional as F

def xcorr_depthwise(x, kernel):
    """Depthwise cross-correlation: each channel of the search feature x is
    correlated with the matching channel of the template feature (kernel)."""
    batch = kernel.size(0)
    channel = kernel.size(1)
    # fold the batch into the channel dim so one grouped conv handles the whole batch
    x = x.view(1, batch*channel, x.size(2), x.size(3))
    kernel = kernel.view(batch*channel, 1, kernel.size(2), kernel.size(3))
    out = F.conv2d(x, kernel, groups=batch*channel)
    out = out.view(batch, channel, out.size(2), out.size(3))
    return out
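To trace the shapes (illustrative; assumes the default training sizes discussed above, i.e. 7x7 template and 31x31 search features after the neck, and the kernel_size=3 convs above):

import torch

z = torch.randn(8, 256, 5, 5)    # conv_kernel output: 7 - 3 + 1 = 5
x = torch.randn(8, 256, 29, 29)  # conv_search output: 31 - 3 + 1 = 29
out = xcorr_depthwise(x, z)
print(out.shape)  # torch.Size([8, 256, 25, 25]), matching cfg.TRAIN.OUTPUT_SIZE = 25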

Finally, the head produces the output (in: 256, out: 10/20):
self.head = nn.Sequential(
        nn.Conv2d(hidden, hidden, kernel_size=1, bias=False),
        nn.BatchNorm2d(hidden),
        nn.ReLU(inplace=True),
        nn.Conv2d(hidden, out_channels, kernel_size=1)
        )

Finally, ModelBuilder's forward function forward(data): the incoming data contains the training image pair plus label_cls, label_loc, and label_loc_weight (these key-value pairs are written when the dataset is built).
The template/search pair is first passed through the model built above to obtain the classification output cls and the localization output loc. cls is passed through log_softmax to obtain log-probabilities; cls and label_cls go into select_cross_entropy_loss to produce the classification loss cls_loss, while loc, label_loc, and label_loc_weight go into weight_l1_loss to produce the regression loss loc_loss. The function returns outputs = {total_loss, cls_loss, loc_loss}.
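A condensed sketch of that forward pass (abridged from pysot's model_builder.py; the optional mask branch is omitted, and cfg.TRAIN.CLS_WEIGHT / cfg.TRAIN.LOC_WEIGHT are the loss-mixing coefficients):

def forward(self, data):
    """Only used in training."""
    template = data['template'].cuda()
    search = data['search'].cuda()
    label_cls = data['label_cls'].cuda()
    label_loc = data['label_loc'].cuda()
    label_loc_weight = data['label_loc_weight'].cuda()

    # run both branches through the shared backbone (+ optional neck)
    zf = self.backbone(template)
    xf = self.backbone(search)
    if cfg.ADJUST.ADJUST:
        zf = self.neck(zf)
        xf = self.neck(xf)
    cls, loc = self.rpn_head(zf, xf)

    # losses
    cls = self.log_softmax(cls)
    cls_loss = select_cross_entropy_loss(cls, label_cls)
    loc_loss = weight_l1_loss(loc, label_loc, label_loc_weight)

    outputs = {}
    outputs['total_loss'] = cfg.TRAIN.CLS_WEIGHT * cls_loss + cfg.TRAIN.LOC_WEIGHT * loc_loss
    outputs['cls_loss'] = cls_loss
    outputs['loc_loss'] = loc_loss
    return outputs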
This concludes the network-construction part. Now back to the main flow.
# Load the pretrained backbone weights into the model
if cfg.BACKBONE.PRETRAINED:
    cur_path = os.path.dirname(os.path.realpath(__file__))
    backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED)
    load_pretrain(model.backbone, backbone_path)

# Aside: loading a model generally involves the following five steps.
1. Set the device
device = torch.device('cuda' if cfg.CUDA else 'cpu') # GPU or CPU
   device = torch.cuda.current_device() # current GPU
2. Load the checkpoint
pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) # load to GPU
   ckpt = torch.load(args.snapshot, map_location=lambda storage, loc: storage.cpu()) # load to CPU
3. Strip prefixes (needed when the weights were saved from a multi-GPU model)
pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') # if "state_dict" in pretrained_dict.keys()
   pretrained_dict = remove_prefix(pretrained_dict, 'module.') # if "state_dict" not in pretrained_dict.keys()
4. Check keys: verify the overlap between the model's keys and the checkpoint's keys
check_keys(model, pretrained_dict)
5. Load the weights
model.load_state_dict(pretrained_dict, strict=False)
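For reference, minimal sketches of the remove_prefix and check_keys helpers used in steps 3 and 4 (condensed from pysot's utils/model_load.py; logging replaced with prints):

def remove_prefix(state_dict, prefix):
    """Strip e.g. 'module.' that DataParallel/DistributedDataParallel prepends."""
    f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x
    return {f(key): value for key, value in state_dict.items()}

def check_keys(model, pretrained_state_dict):
    ckpt_keys = set(pretrained_state_dict.keys())
    model_keys = set(model.state_dict().keys())
    used_keys = ckpt_keys & model_keys
    # keys the model expects but the checkpoint lacks, and vice versa
    print('missing keys:', sorted(model_keys - ckpt_keys))
    print('unused checkpoint keys:', sorted(ckpt_keys - model_keys))
    assert len(used_keys) > 0, 'load NONE from pretrained checkpoint'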

# Build the data loader
# 1. Build the training set
train_dataset = TrkDataset()

Let's look at how the dataset is built. First, note that a Dataset class must implement the indexing method __getitem__ so that the DataLoader can fetch samples conveniently, and must report the dataset length (implement __len__) so that the DataLoader can automatically draw mini-batches from the Dataset for training.
dataset.py => class TrkDataset => the initializer __init__
# initialization
def __init__(self,):
    super(TrkDataset, self).__init__()
    desired_size = (cfg.TRAIN.SEARCH_SIZE - cfg.TRAIN.EXEMPLAR_SIZE) / cfg.ANCHOR.STRIDE + 1 + cfg.TRAIN.BASE_SIZE
    if desired_size != cfg.TRAIN.OUTPUT_SIZE:
        raise Exception('size not match!')
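    # With the siamrpn_r50 defaults (SEARCH_SIZE=255, EXEMPLAR_SIZE=127,
    # ANCHOR.STRIDE=8, BASE_SIZE=8): (255 - 127) / 8 + 1 + 8 = 25 = cfg.TRAIN.OUTPUT_SIZE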
    
    # create anchor target (note: this generates the anchors)
    #  1. Generate anchors of shape (5,4) from stride/ratios/scales; 5: number of anchors, 4: x1,y1,x2,y2.
    #  2. Place the 5 anchor boxes at every position of the 25x25 output map, so each coordinate
    #     x1,y1,x2,y2 is expanded over 5 anchor boxes at the 25x25 spatial positions.
    #  3. The generated anchors are kept in the Anchors instance as all_anchors, in both corner
    #     and center form.
    self.anchor_target = AnchorTarget()   # TrkDataset:anchor_target => AnchorTarget:anchors => Anchors:all_anchors
    
    # create sub dataset
    self.all_dataset = []
    start = 0
    self.num = 0
    for name in cfg.DATASET.NAMES:
        subdata_cfg = getattr(cfg.DATASET, name)
        # SubDataset wraps one sub-dataset; its main steps:
        # 1. Read the annotation file train.json;
        # 2. Filter out invalid videos and invalid tracks;
        # 3. Attach the frame list to each track: meta_data[video][track]['frames'] = frames
        # 4. Keep the cleaned annotations: self.labels = meta_data
        # 5. Attributes: self.num (number of videos) / self.num_use (number of videos actually used;
        #    -1 means use self.num) / self.videos (video names) / self.pick (shuffled list of video
        #    indices, repeated or truncated to num_use entries) / self.start_idx (global start index
        #    of this sub-dataset) / self.frame_range (max frame gap when sampling pairs)
        sub_dataset = SubDataset(
                name,
                subdata_cfg.ROOT,
                subdata_cfg.ANNO,
                subdata_cfg.FRAME_RANGE,
                subdata_cfg.NUM_USE, #VID=10000,others=-1
                start #from 0 to +num_VID/+num_COCO/+num_YTB/+num_DET
            )
        start += sub_dataset.num # advance the global start index for the next sub-dataset
        self.num += sub_dataset.num_use #from 0 to +num_use_VID/+num_use_COCO/+num_use_YTB/+num_use_DET
        sub_dataset.log()
        self.all_dataset.append(sub_dataset) # collect every sub-dataset
    
    # data augmentation
    self.template_aug = Augmentation(
            cfg.DATASET.TEMPLATE.SHIFT,
            cfg.DATASET.TEMPLATE.SCALE,
            cfg.DATASET.TEMPLATE.BLUR,
            cfg.DATASET.TEMPLATE.FLIP,
            cfg.DATASET.TEMPLATE.COLOR
        )
    self.search_aug = Augmentation(
            cfg.DATASET.SEARCH.SHIFT,
            cfg.DATASET.SEARCH.SCALE,
            cfg.DATASET.SEARCH.BLUR,
            cfg.DATASET.SEARCH.FLIP,
            cfg.DATASET.SEARCH.COLOR
        )
    videos_per_epoch = cfg.DATASET.VIDEOS_PER_EPOCH # 600000
    self.num = videos_per_epoch if videos_per_epoch > 0 else self.num
    self.num *= cfg.TRAIN.EPOCH # 600000 * 20 = 12,000,000 samples over the whole run
    self.pick = self.shuffle() # shuffled list of sample indices
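A minimal sketch of that shuffle method (condensed from pysot's dataset.py; assumes import numpy as np): it concatenates each sub-dataset's pick list, shuffles, and repeats until self.num indices are available:

def shuffle(self):
    pick = []
    m = 0
    while m < self.num:
        p = []
        for sub_dataset in self.all_dataset:
            p += sub_dataset.pick  # already offset by the sub-dataset's start index
        np.random.shuffle(p)
        pick += p
        m = len(pick)
    return pick[:self.num]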

dataset.py => class TrkDataset => the data access method __getitem__
def __getitem__(self, index):
    index = self.pick[index] # map to an entry of the shuffled index list
    dataset, index = self._find_dataset(index) # locate the sub-dataset this index falls in, and the index within it
    # randomly decide whether to use grayscale (gray) and whether to build a negative pair (neg)
    gray = cfg.DATASET.GRAY and cfg.DATASET.GRAY > np.random.random()
    neg = cfg.DATASET.NEG and cfg.DATASET.NEG > np.random.random()
    # get one sample
    if neg: # take template and search patches from different videos to form a negative pair
        # a random frame of the video at this index; returns img_path & img_anno
        template = dataset.get_random_target(index)
        # a random frame from a randomly chosen sub-dataset; returns img_path & img_anno
        search = np.random.choice(self.all_dataset).get_random_target()
    else: # take two frames of the same video to form a positive pair
        template, search = dataset.get_positive_pair(index) # img_path & img_anno of two frames within frame_range
    # get image
    template_image = cv2.imread(template[0])
    search_image = cv2.imread(search[0])
    # get bounding box (BBox ground truth)
    template_box = self._get_bbox(template_image, template[1]) # x1,y1,x2,y2
    search_box = self._get_bbox(search_image, search[1])
    # augmentation of the template/search images
    template, _ = self.template_aug(template_image,
                                    template_box,
                                    cfg.TRAIN.EXEMPLAR_SIZE,
                                    gray=gray)
    search, bbox = self.search_aug(search_image,
                                   search_box,
                                   cfg.TRAIN.SEARCH_SIZE,
                                   gray=gray)
    # get labels
    # calls AnchorTarget.__call__, which returns:
    #   1. classification labels cls
    #   2. regression targets delta
    #   3. regression weights delta_weight
    #   4. the IoU overlap between the anchors and bbox
    cls, delta, delta_weight, overlap = self.anchor_target(bbox, cfg.TRAIN.OUTPUT_SIZE, neg)
    template = template.transpose((2, 0, 1)).astype(np.float32)
    search = search.transpose((2, 0, 1)).astype(np.float32)
    return {
            'template': template,
            'search': search,
            'label_cls': cls,
            'label_loc': delta,
            'label_loc_weight': delta_weight,
            'bbox': np.array(bbox)
            }

# 2. (optional, for distributed training) create the dataset sampler
train_sampler = None
if get_world_size() > 1:
    train_sampler = DistributedSampler(train_dataset)
# 3. Build the data loader
train_loader = DataLoader(train_dataset,
                          batch_size=cfg.TRAIN.BATCH_SIZE,
                          num_workers=cfg.TRAIN.NUM_WORKERS,
                          pin_memory=True,
                          sampler=train_sampler)

Build the optimizer and the learning-rate scheduler. Building the SGD optimizer requires the list of trainable parameters, the momentum, and the weight decay; building the learning-rate scheduler lr_scheduler requires the optimizer and the total number of training epochs. The steps are as follows (a consolidated sketch follows the list):
1. Build the list of trainable parameters
trainable_params = []
trainable_params += [{'params': filter(lambda x: x.requires_grad,
                                       model.backbone.parameters()),
                      'lr': cfg.BACKBONE.LAYERS_LR * cfg.TRAIN.BASE_LR}]
2. Build the optimizer
optimizer = torch.optim.SGD(trainable_params, momentum=cfg.TRAIN.MOMENTUM, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
3. Build the learning-rate scheduler
lr_scheduler = build_lr_scheduler(optimizer, epochs=cfg.TRAIN.EPOCH)
4. Step the scheduler to the starting epoch
lr_scheduler.step(cfg.TRAIN.START_EPOCH)
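Putting the four steps together, a condensed sketch in the spirit of pysot's build_opt_lr (the real function also freezes or unfreezes backbone layers depending on the current epoch, omitted here; build_lr_scheduler is pysot's scheduler factory, and the neck/rpn_head groups use the base learning rate):

def build_opt_lr(model, current_epoch=0):
    # the backbone trains at a reduced learning rate
    trainable_params = [{'params': filter(lambda x: x.requires_grad,
                                          model.backbone.parameters()),
                         'lr': cfg.BACKBONE.LAYERS_LR * cfg.TRAIN.BASE_LR}]
    if cfg.ADJUST.ADJUST:
        trainable_params += [{'params': model.neck.parameters(),
                              'lr': cfg.TRAIN.BASE_LR}]
    trainable_params += [{'params': model.rpn_head.parameters(),
                          'lr': cfg.TRAIN.BASE_LR}]
    optimizer = torch.optim.SGD(trainable_params,
                                momentum=cfg.TRAIN.MOMENTUM,
                                weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    lr_scheduler = build_lr_scheduler(optimizer, epochs=cfg.TRAIN.EPOCH)
    lr_scheduler.step(cfg.TRAIN.START_EPOCH)
    return optimizer, lr_scheduler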

Training phase. Core code: def train(train_loader, model, optimizer, lr_scheduler, tb_writer)
for idx, data in enumerate(train_loader):
    if epoch != idx // num_per_epoch + start_epoch:
        epoch = idx // num_per_epoch + start_epoch
        if get_rank() == 0:
            torch.save(
                    {'epoch': epoch,
                     'state_dict': model.module.state_dict(),
                     'optimizer': optimizer.state_dict()},
                    cfg.TRAIN.SNAPSHOT_DIR + '/checkpoint_e%d.pth' % (epoch))
        if epoch == cfg.TRAIN.EPOCH:
            return
        if cfg.BACKBONE.TRAIN_EPOCH == epoch:
            optimizer, lr_scheduler = build_opt_lr(model.module, epoch)
        
        # a new epoch has started: update the learning rate
        lr_scheduler.step(epoch)
        cur_lr = lr_scheduler.get_cur_lr()
    
    outputs = model(data)
    loss = outputs['total_loss']
    if is_valid_number(loss.data.item()):
        optimizer.zero_grad()
        loss.backward()
        reduce_gradients(model)
        if rank == 0 and cfg.TRAIN.LOG_GRADS:
            log_grads(model.module, tb_writer, tb_idx)
        # clip gradient
        clip_grad_norm_(model.parameters(), cfg.TRAIN.GRAD_CLIP)
        optimizer.step()
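Two details worth noting here: reduce_gradients(model) all-reduces the gradients across the distributed processes before the update, and clip_grad_norm_ caps the global gradient norm at cfg.TRAIN.GRAD_CLIP to keep training stable.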

This completes the walkthrough of the pysot training process. Training runs the whole pipeline from data loading through model optimization to checkpoint saving; once these task flows are clear, you have understood essentially the entire project. If you have any questions, feel free to discuss them; I hope this is of some help.