Object Detection: Understanding the Faster R-CNN Training Process from the Source Code
Training starts from the main function of the training script ./tools/train_net.py.
Data-loading layer: RoIDataLayer
First, the script builds the dataset:
imdb, roidb = combined_roidb(args.imdb_name)  # imdb_name is 'voc_2007_trainval'
print '{:d} roidb entries'.format(len(roidb))
Next, the combined_roidb function:
def combined_roidb(imdb_names):
    def get_roidb(imdb_name):
        imdb = get_imdb(imdb_name)  # get_imdb (in factory.py) returns pascal_voc('trainval', '2007')
        # imdb holds the dataset metadata, not the image data itself
        print 'Loaded dataset `{:s}` for training'.format(imdb.name)
        imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
        print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
        roidb = get_training_roidb(imdb)
        return roidb

    roidbs = [get_roidb(s) for s in imdb_names.split('+')]
    # imdb_names.split('+') yields just ['voc_2007_trainval'] here,
    # so get_roidb is called once
    roidb = roidbs[0]
    if len(roidbs) > 1:
        for r in roidbs[1:]:
            roidb.extend(r)
        imdb = datasets.imdb.imdb(imdb_names)
    else:
        imdb = get_imdb(imdb_names)
    return imdb, roidb
The pascal_voc class that wraps the dataset:
class pascal_voc(imdb):  # inherits from imdb
    def __init__(self, image_set, year, devkit_path=None):
        imdb.__init__(self, 'voc_' + year + '_' + image_set)
        self._year = year
        self._image_set = image_set
        self._devkit_path = '/data/VOCdevkit'
        self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
        self._classes = ('__background__', # always index 0
                         'aeroplane', 'bicycle', 'bird', 'boat',
                         'bottle', 'bus', 'car', 'cat', 'chair',
                         'cow', 'diningtable', 'dog', 'horse',
                         'motorbike', 'person', 'pottedplant',
                         'sheep', 'sofa', 'train', 'tvmonitor')
        self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
        self._image_ext = '.jpg'
        self._image_index = self._load_image_set_index()
        # Default to roidb handler
        self._roidb_handler = self.selective_search_roidb
        self._salt = str(uuid.uuid4())  # random suffix for result files (used with config['use_salt'])
        self._comp_id = 'comp4'         # VOC evaluation competition code
        # PASCAL specific config options
        self.config = {'cleanup'     : True,
                       'use_salt'    : True,
                       'use_diff'    : False,
                       'matlab_eval' : False,
                       'rpn_file'    : None,
                       'min_size'    : 2}

        assert os.path.exists(self._devkit_path), \
                'VOCdevkit path does not exist: {}'.format(self._devkit_path)
        assert os.path.exists(self._data_path), \
                'Path does not exist: {}'.format(self._data_path)

# the parent class:
class imdb(object):
    """Image database."""
    def __init__(self, name):
        self._name = name
        self._num_classes = 0
        self._classes = []
        self._image_index = []
        self._obj_proposer = 'selective_search'
        self._roidb = None
        self._roidb_handler = self.default_roidb
        # Use this dict for storing dataset specific config options
        self.config = {}
The resulting imdb = pascal_voc('trainval', '2007') records the following members:
[1] - _class_to_ind, a dict mapping class name to label value (starting from 0); (key[0], value[0]) = ('__background__', 0)
[2] - _classes, the class names: 20 object classes + 1 background = 21 classes
[3] - _data_path, the path to the dataset
[4] - _image_ext, '.jpg', the image file extension
[5] - _image_index, the list of image indices
[6] - _image_set, 'trainval'
[7] - _name, the dataset name, voc_2007_trainval
[8] - _num_classes, 0 (the num_classes property is derived from _classes instead)
[9] - _obj_proposer, 'selective_search'
[10] - _roidb, None
[11] - classes, property, same as _classes
[12] - image_index, property, same as _image_index
[13] - name, property, the dataset name, same as _name
[14] - num_classes, the number of classes, 21
[15] - num_images, the number of images
[16] - config, a dict of PASCAL-specific configuration options
Once the imdb is loaded, the proposal method is set:
imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
In config.py the default value of cfg.TRAIN.PROPOSAL_METHOD is 'selective_search', but
experiments/cfgs/faster_rcnn_end2end.yml overrides cfg.TRAIN.PROPOSAL_METHOD to 'gt'.
The set_proposal_method function:
def set_proposal_method(self, method):
    method = eval('self.' + method + '_roidb')  # with method = 'gt', this is self.gt_roidb
    self.roidb_handler = method
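The eval trick simply turns the method-name string into a bound method. A minimal sketch of the same idea using getattr (my own illustration, not code from the repo):

class Dataset(object):
    def gt_roidb(self):
        return ['gt entries']

    def set_proposal_method(self, method):
        # getattr is the safer equivalent of eval('self.' + method + '_roidb')
        self.roidb_handler = getattr(self, method + '_roidb')

d = Dataset()
d.set_proposal_method('gt')
print d.roidb_handler()  # -> ['gt entries']

Since cfg.TRAIN.PROPOSAL_METHOD is 'gt' here, the handler resolves to pascal_voc.gt_roidb: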
def gt_roidb(self):
    """
    Return the database of ground-truth regions of interest.

    This function loads/saves from/to a cache file to speed up future calls.
    """
    cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as fid:
            roidb = cPickle.load(fid)
        print '{} gt roidb loaded from {}'.format(self.name, cache_file)
        return roidb

    gt_roidb = [self._load_pascal_annotation(index)
                for index in self.image_index]
    with open(cache_file, 'wb') as fid:
        cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
    print 'wrote gt roidb to {}'.format(cache_file)

    return gt_roidb
def _load_pascal_annotation(self, index):
    """
    Load image and bounding boxes info from XML file in the PASCAL VOC
    format.
    """
    filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
    tree = ET.parse(filename)
    objs = tree.findall('object')
    if not self.config['use_diff']:
        # Exclude the samples labeled as difficult
        non_diff_objs = [
            obj for obj in objs if int(obj.find('difficult').text) == 0]
        # if len(non_diff_objs) != len(objs):
        #     print 'Removed {} difficult objects'.format(
        #         len(objs) - len(non_diff_objs))
        objs = non_diff_objs
    num_objs = len(objs)

    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
    gt_classes = np.zeros((num_objs), dtype=np.int32)
    overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
    # "Seg" area for pascal is just the box area
    seg_areas = np.zeros((num_objs), dtype=np.float32)

    # Load object bounding boxes into a data frame.
    for ix, obj in enumerate(objs):
        bbox = obj.find('bndbox')
        # Make pixel indexes 0-based
        x1 = float(bbox.find('xmin').text) - 1
        y1 = float(bbox.find('ymin').text) - 1
        x2 = float(bbox.find('xmax').text) - 1
        y2 = float(bbox.find('ymax').text) - 1
        cls = self._class_to_ind[obj.find('name').text.lower().strip()]
        boxes[ix, :] = [x1, y1, x2, y2]
        gt_classes[ix] = cls
        overlaps[ix, cls] = 1.0
        seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)

    overlaps = scipy.sparse.csr_matrix(overlaps)

    return {'boxes' : boxes,
            'gt_classes': gt_classes,
            'gt_overlaps' : overlaps,
            'flipped' : False,
            'seg_areas' : seg_areas}
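For a ground-truth box, the corresponding row of gt_overlaps is just a one-hot vector over the 21 classes. A minimal sketch of what one annotation entry looks like for an image with two objects (toy boxes are my own; class indices follow the _classes tuple above):

import numpy as np
import scipy.sparse

num_classes = 21
boxes = np.array([[ 47, 239, 194, 370],
                  [  7,  11, 351, 497]], dtype=np.uint16)  # 0-based (x1, y1, x2, y2)
gt_classes = np.array([12, 15], dtype=np.int32)  # 'dog' and 'person' in the class tuple

overlaps = np.zeros((2, num_classes), dtype=np.float32)
overlaps[np.arange(2), gt_classes] = 1.0  # one-hot row per ground-truth box
entry = {'boxes': boxes, 'gt_classes': gt_classes,
         'gt_overlaps': scipy.sparse.csr_matrix(overlaps), 'flipped': False}
print entry['gt_overlaps'].toarray().argmax(axis=1)  # -> [12 15]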
Continuing,
roidb = get_training_roidb(imdb)
The get_training_roidb function lives in fast_rcnn/train.py:
def get_training_roidb(imdb):
    """Returns a roidb (Region of Interest database) for use in training."""
    if cfg.TRAIN.USE_FLIPPED:  # data augmentation by horizontal flipping
        print 'Appending horizontally-flipped training examples...'
        imdb.append_flipped_images()
        # doubles the imdb: VOC2007 trainval has 5011 images, giving 10022
        print 'done'

    print 'Preparing training data...'
    rdl_roidb.prepare_roidb(imdb)  # add derived fields to each roidb entry
    print 'done'

    return imdb.roidb
The rdl_roidb.prepare_roidb function:
def prepare_roidb(imdb):
    """Enrich the imdb's roidb by adding some derived quantities that
    are useful for training. This function precomputes the maximum
    overlap, taken over ground-truth boxes, between each ROI and
    each ground-truth box. The class with maximum overlap is also
    recorded.
    """
    sizes = [PIL.Image.open(imdb.image_path_at(i)).size
             for i in xrange(imdb.num_images)]  # (width, height) of each image
    roidb = imdb.roidb
    for i in xrange(len(imdb.image_index)):
        roidb[i]['image'] = imdb.image_path_at(i)  # the image path
        roidb[i]['width'] = sizes[i][0]
        roidb[i]['height'] = sizes[i][1]
        # need gt_overlaps as a dense array for argmax
        gt_overlaps = roidb[i]['gt_overlaps'].toarray()
        # max overlap with gt over classes (columns)
        max_overlaps = gt_overlaps.max(axis=1)
        # gt class that had the max overlap
        max_classes = gt_overlaps.argmax(axis=1)
        roidb[i]['max_classes'] = max_classes
        roidb[i]['max_overlaps'] = max_overlaps
        # sanity checks
        # max overlap of 0 => class should be zero (background)
        zero_inds = np.where(max_overlaps == 0)[0]
        assert all(max_classes[zero_inds] == 0)
        # max overlap > 0 => class should not be zero (must be a fg class)
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert all(max_classes[nonzero_inds] != 0)
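With the 'gt' proposal method every row of gt_overlaps is one-hot, so max_overlaps is all ones and max_classes simply recovers gt_classes. A quick toy check (my own numbers, 4 classes for brevity):

import numpy as np

gt_overlaps = np.array([[0., 0., 1., 0.],   # box 0: class 2
                        [0., 1., 0., 0.]])  # box 1: class 1
print gt_overlaps.max(axis=1)     # -> [ 1.  1.]  (max_overlaps)
print gt_overlaps.argmax(axis=1)  # -> [2 1]      (max_classes)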
The resulting imdb.roidb is a list containing the information for every sample; imdb.roidb[index] is one sample's data, a dict whose keys and values are:
[1] - boxes, the box coordinates, a box_num×4 ndarray
[2] - flipped, whether the image is horizontally flipped, True or False
[3] - gt_classes, the ground-truth class labels of all boxes in the image, a box_num×1 ndarray
[4] - gt_overlaps, the score of every box against every class, a box_num×classes_num matrix
[5] - height, the image height
[6] - width, the image width
[7] - image, the image path
[8] - max_classes, the class with the highest score for each box, a box_num×1 ndarray
[9] - max_overlaps, the maximum score over all classes for each box, a box_num×1 ndarray
The call then returns to the combined_roidb(imdb_names) function in train_net.py:
roidb = roidbs[0]
if len(roidbs) > 1:  # only when training on multiple datasets joined with '+'
    for r in roidbs[1:]:
        roidb.extend(r)
    imdb = datasets.imdb.imdb(imdb_names)
else:
    imdb = get_imdb(imdb_names)  # imdb = pascal_voc('trainval', '2007')
Note that the roidb belongs to the imdb, and that building the roidb does not actually read the image data; it only sets up the associated metadata.
Next,
train_net(args.solver, roidb, output_dir,
          pretrained_model=args.pretrained_model,
          max_iters=args.max_iters)
Stepping into train_net:
def train_net(solver_prototxt, roidb, output_dir,
              pretrained_model=None, max_iters=40000):
    """Train a Fast R-CNN network."""
    roidb = filter_roidb(roidb)
    sw = SolverWrapper(solver_prototxt, roidb, output_dir,
                       pretrained_model=pretrained_model)

    print 'Solving...'
    model_paths = sw.train_model(max_iters)
    print 'done solving'
    return model_paths
The roidb is processed first: the filter_roidb function removes entries with no usable RoIs,
def filter_roidb(roidb):
    """Remove roidb entries that have no usable RoIs."""

    def is_valid(entry):
        # Valid images have:
        #   (1) At least one foreground RoI OR
        #   (2) At least one background RoI
        overlaps = entry['max_overlaps']
        # find boxes with sufficient overlap
        fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
        # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
        bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                           (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
        # image is only valid if such boxes exist
        valid = len(fg_inds) > 0 or len(bg_inds) > 0
        return valid

    num = len(roidb)
    filtered_roidb = [entry for entry in roidb if is_valid(entry)]
    num_after = len(filtered_roidb)
    print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
                                                       num, num_after)
    return filtered_roidb
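A quick sketch of the validity test, assuming the default thresholds from config.py (FG_THRESH = 0.5, BG_THRESH_HI = 0.5, BG_THRESH_LO = 0.1; the toy overlap values are mine):

import numpy as np

FG_THRESH, BG_THRESH_HI, BG_THRESH_LO = 0.5, 0.5, 0.1

overlaps = np.array([1.0, 0.45, 0.05])  # max_overlaps of three boxes
fg_inds = np.where(overlaps >= FG_THRESH)[0]          # -> [0]
bg_inds = np.where((overlaps < BG_THRESH_HI) &
                   (overlaps >= BG_THRESH_LO))[0]     # -> [1]
print len(fg_inds) > 0 or len(bg_inds) > 0            # True: the entry is kept

Note that with the 'gt' proposal method every max_overlap is 1.0, so in end-to-end training nothing is actually filtered out here.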
Next,
sw = SolverWrapper(solver_prototxt, roidb, output_dir,
                   pretrained_model=pretrained_model)
SolverWrapper wraps the Caffe solver object:
class SolverWrapper(object):
    """A simple wrapper around Caffe's solver.
    This wrapper gives us control over the snapshotting process, which we
    use to unnormalize the learned bounding-box regression weights.
    """

    def __init__(self, solver_prototxt, roidb, output_dir,
                 pretrained_model=None):
        """Initialize the SolverWrapper."""
        self.output_dir = output_dir

        if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
            cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
            # RPN can only use precomputed normalization because there are no
            # fixed statistics to compute a priori
            assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED

        if cfg.TRAIN.BBOX_REG:
            print 'Computing bounding-box regression targets...'
            self.bbox_means, self.bbox_stds = \
                    rdl_roidb.add_bbox_regression_targets(roidb)
            # computes the regression targets from each box to its matched gt,
            # adding a 'bbox_targets' key to every roidb dict
            print 'done'

        self.solver = caffe.SGDSolver(solver_prototxt)  # load solver.prototxt
        # tops of the data layer defined in train.prototxt:
        #   top[0] - data     (1, 3, 600, 1000)
        #   top[1] - im_info  (1, 3)
        #   top[2] - gt_boxes (1, 4)
        if pretrained_model is not None:
            print ('Loading pretrained model '
                   'weights from {:s}').format(pretrained_model)
            self.solver.net.copy_from(pretrained_model)  # load the pretrained weights

        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)

        self.solver.net.layers[0].set_roidb(roidb)
        # layers[0] is the RoIDataLayer; set_roidb hands the roidb to the data layer
The first function called here is add_bbox_regression_targets:
def add_bbox_regression_targets(roidb):
    """Add information needed to train bounding-box regressors."""
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    num_images = len(roidb)
    # Infer number of classes from the number of columns in gt_overlaps
    num_classes = roidb[0]['gt_overlaps'].shape[1]
    for im_i in xrange(num_images):
        rois = roidb[im_i]['boxes']
        max_overlaps = roidb[im_i]['max_overlaps']
        max_classes = roidb[im_i]['max_classes']
        roidb[im_i]['bbox_targets'] = \
                _compute_targets(rois, max_overlaps, max_classes)

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Use fixed / precomputed "means" and "stds" instead of empirical values
        means = np.tile(
                np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
        stds = np.tile(
                np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
    else:
        # Compute values needed for means and stds
        # var(x) = E(x^2) - E(x)^2
        class_counts = np.zeros((num_classes, 1)) + cfg.EPS
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        for im_i in xrange(num_images):
            targets = roidb[im_i]['bbox_targets']
            for cls in xrange(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                if cls_inds.size > 0:
                    class_counts[cls] += cls_inds.size
                    sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
                    squared_sums[cls, :] += \
                            (targets[cls_inds, 1:] ** 2).sum(axis=0)

        means = sums / class_counts
        stds = np.sqrt(squared_sums / class_counts - means ** 2)

    print 'bbox target means:'
    print means
    print means[1:, :].mean(axis=0)  # ignore bg class
    print 'bbox target stdevs:'
    print stds
    print stds[1:, :].mean(axis=0)  # ignore bg class

    # Normalize targets
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
        print "Normalizing targets"
        for im_i in xrange(num_images):
            targets = roidb[im_i]['bbox_targets']
            for cls in xrange(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
                roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
    else:
        print "NOT normalizing targets"

    # These values will be needed for making predictions
    # (the predicts will need to be unnormalized and uncentered)
    return means.ravel(), stds.ravel()
def _compute_targets(rois, overlaps, labels):
    """Compute bounding-box regression targets for an image."""
    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        # Bail if the image has no ground-truth ROIs
        return np.zeros((rois.shape[0], 5), dtype=np.float32)
    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = bbox_overlaps(
        np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
        np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds]  # column 0: the class label
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)  # columns 1-4: the regression deltas
    return targets
def bbox_transform(ex_rois, gt_rois):
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights

    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights

    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    targets = np.vstack(
        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
    return targets
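This is the standard (dx, dy, dw, dh) parameterization from the R-CNN papers: center offsets normalized by the proposal size, plus log-scale width and height factors. A worked toy example using the bbox_transform above (the numbers are mine):

import numpy as np

ex = np.array([[ 0.,  0.,  99.,  99.]])   # proposal: 100x100, center (50, 50)
gt = np.array([[10., 20., 109., 139.]])   # ground truth: 100x120, center (60, 80)
print bbox_transform(ex, gt)
# dx = (60 - 50) / 100 = 0.1
# dy = (80 - 50) / 100 = 0.3
# dw = log(100 / 100)  = 0.0
# dh = log(120 / 100) ~= 0.182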
Now look at the last line of SolverWrapper.__init__, self.solver.net.layers[0].set_roidb(roidb):
def set_roidb(self, roidb):
    """Set the roidb to be used by this layer during training."""
    self._roidb = roidb
    self._shuffle_roidb_inds()  # shuffle the roidb indices
    if cfg.TRAIN.USE_PREFETCH:  # optionally prefetch blobs in a child process
        self._blob_queue = Queue(10)
        self._prefetch_process = BlobFetcher(self._blob_queue,
                                             self._roidb,
                                             self._num_classes)
        self._prefetch_process.start()
        # Terminate the child process when the parent exits
        def cleanup():
            print 'Terminating BlobFetcher'
            self._prefetch_process.terminate()
            self._prefetch_process.join()
        import atexit
        atexit.register(cleanup)
def _shuffle_roidb_inds(self):
    """Randomly permute the training roidb."""
    if cfg.TRAIN.ASPECT_GROUPING:
        widths = np.array([r['width'] for r in self._roidb])
        heights = np.array([r['height'] for r in self._roidb])
        horz = (widths >= heights)
        vert = np.logical_not(horz)
        horz_inds = np.where(horz)[0]
        vert_inds = np.where(vert)[0]
        inds = np.hstack((
            np.random.permutation(horz_inds),
            np.random.permutation(vert_inds)))
        inds = np.reshape(inds, (-1, 2))
        row_perm = np.random.permutation(np.arange(inds.shape[0]))
        inds = np.reshape(inds[row_perm, :], (-1,))
        self._perm = inds
    else:
        self._perm = np.random.permutation(np.arange(len(self._roidb)))
    self._cur = 0
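ASPECT_GROUPING pairs images of the same orientation (landscape with landscape, portrait with portrait) so that the two images of a mini-batch need similar padding. A sketch of the index shuffle on six images (toy sizes are mine):

import numpy as np
np.random.seed(0)

widths  = np.array([500, 333, 500, 500, 480, 353])
heights = np.array([375, 500, 333, 400, 360, 500])
horz_inds = np.where(widths >= heights)[0]  # [0 2 3 4]: landscape
vert_inds = np.where(widths <  heights)[0]  # [1 5]:    portrait

inds = np.hstack((np.random.permutation(horz_inds),
                  np.random.permutation(vert_inds)))
inds = inds.reshape(-1, 2)           # consecutive pairs share an orientation
row_perm = np.random.permutation(np.arange(inds.shape[0]))
print inds[row_perm, :].reshape(-1)  # e.g. [0 3 5 1 4 2]: adjacent pairs match

Since end-to-end training with RPN uses IMS_PER_BATCH = 1, this grouping mainly matters for the 2-image mini-batches of Fast R-CNN training.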
At this point the roidb is fully prepared, but no image data has been read yet.
Now the RoIDataLayer itself and its forward path:
class RoIDataLayer(caffe.Layer):
    """Fast R-CNN data layer used for training."""

    def _shuffle_roidb_inds(self):
        """Randomly permute the training roidb."""
        if cfg.TRAIN.ASPECT_GROUPING:
            widths = np.array([r['width'] for r in self._roidb])
            heights = np.array([r['height'] for r in self._roidb])
            horz = (widths >= heights)
            vert = np.logical_not(horz)
            horz_inds = np.where(horz)[0]
            vert_inds = np.where(vert)[0]
            inds = np.hstack((
                np.random.permutation(horz_inds),
                np.random.permutation(vert_inds)))
            inds = np.reshape(inds, (-1, 2))
            row_perm = np.random.permutation(np.arange(inds.shape[0]))
            inds = np.reshape(inds[row_perm, :], (-1,))
            self._perm = inds
        else:
            self._perm = np.random.permutation(np.arange(len(self._roidb)))
        self._cur = 0

    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch."""
        if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
            self._shuffle_roidb_inds()  # epoch finished: reshuffle the roidb
        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]  # indices for this mini-batch
        self._cur += cfg.TRAIN.IMS_PER_BATCH
        return db_inds

    def _get_next_minibatch(self):
        """Return the blobs to be used for the next minibatch.

        If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
        separate process and made available through self._blob_queue.
        """
        if cfg.TRAIN.USE_PREFETCH:  # take a prefetched blob from the queue
            return self._blob_queue.get()
        else:
            db_inds = self._get_next_minibatch_inds()  # indices of the next mini-batch
            minibatch_db = [self._roidb[i] for i in db_inds]  # roidb entries of the mini-batch
            return get_minibatch(minibatch_db, self._num_classes)  # build the mini-batch blobs

    def setup(self, bottom, top):
        """Setup the RoIDataLayer."""
        # parse the layer parameter string, which must be valid YAML
        layer_params = yaml.load(self.param_str_)
        self._num_classes = layer_params['num_classes']
        self._name_to_top_map = {}

        # data blob: holds a batch of N images, each with 3 channels
        idx = 0
        top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
                         max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
        self._name_to_top_map['data'] = idx
        idx += 1

        if cfg.TRAIN.HAS_RPN:
            top[idx].reshape(1, 3)
            self._name_to_top_map['im_info'] = idx
            idx += 1

            top[idx].reshape(1, 4)
            self._name_to_top_map['gt_boxes'] = idx
            idx += 1
        else:  # not using RPN
            # rois blob: holds R regions of interest, each is a 5-tuple
            # (n, x1, y1, x2, y2) specifying an image batch index n and a
            # rectangle (x1, y1, x2, y2)
            top[idx].reshape(1, 5)
            self._name_to_top_map['rois'] = idx
            idx += 1

            # labels blob: R categorical labels in [0, ..., K] for K foreground
            # classes plus background
            top[idx].reshape(1)
            self._name_to_top_map['labels'] = idx
            idx += 1

            if cfg.TRAIN.BBOX_REG:
                # bbox_targets blob: R bounding-box regression targets with 4
                # targets per class
                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_targets'] = idx
                idx += 1

                # bbox_inside_weights blob: At most 4 targets per roi are active;
                # this binary vector specifies the subset of active targets
                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_inside_weights'] = idx
                idx += 1

                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_outside_weights'] = idx
                idx += 1

        print 'RoiDataLayer: name_to_top:', self._name_to_top_map
        assert len(top) == len(self._name_to_top_map)

    def forward(self, bottom, top):
        """Get blobs and copy them into this layer's top blob vector."""
        blobs = self._get_next_minibatch()
        # blobs contains: data, im_info, gt_boxes
        for blob_name, blob in blobs.iteritems():
            top_ind = self._name_to_top_map[blob_name]
            # Reshape net's input blobs
            top[top_ind].reshape(*(blob.shape))
            # Copy data into net's input blobs
            top[top_ind].data[...] = blob.astype(np.float32, copy=False)

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass
Here get_minibatch finally reads the mini-batch's image data and roidb data:
def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it."""
    num_images = len(roidb)  # number of images in the mini-batch
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                    size=num_images)
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)
    rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images  # rois sampled per image
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
    # cfg.TRAIN.FG_FRACTION = 0.25: fraction of foreground rois (class > 0)

    # Get the input image blob, formatted for caffe
    im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

    blobs = {'data': im_blob}  # the data blob

    if cfg.TRAIN.HAS_RPN:
        assert len(im_scales) == 1, "Single batch only"  # with RPN: one image, one scale per mini-batch
        assert len(roidb) == 1, "Single batch only"
        # gt boxes: (x1, y1, x2, y2, cls)
        gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
        gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
        blobs['gt_boxes'] = gt_boxes
        blobs['im_info'] = np.array(
            [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
            dtype=np.float32)
    else:  # not using RPN
        # Now, build the region of interest and label blobs
        rois_blob = np.zeros((0, 5), dtype=np.float32)
        labels_blob = np.zeros((0), dtype=np.float32)
        bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
        bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
        # all_overlaps = []
        for im_i in xrange(num_images):
            labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
                = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
                               num_classes)

            # Add to RoIs blob
            rois = _project_im_rois(im_rois, im_scales[im_i])
            batch_ind = im_i * np.ones((rois.shape[0], 1))
            rois_blob_this_image = np.hstack((batch_ind, rois))
            rois_blob = np.vstack((rois_blob, rois_blob_this_image))

            # Add to labels, bbox targets, and bbox loss blobs
            labels_blob = np.hstack((labels_blob, labels))
            bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
            bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
            # all_overlaps = np.hstack((all_overlaps, overlaps))

        # For debug visualizations
        # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)

        blobs['rois'] = rois_blob
        blobs['labels'] = labels_blob

        if cfg.TRAIN.BBOX_REG:
            blobs['bbox_targets'] = bbox_targets_blob
            blobs['bbox_inside_weights'] = bbox_inside_blob
            blobs['bbox_outside_weights'] = \
                np.array(bbox_inside_blob > 0).astype(np.float32)

    return blobs
def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.
    """
    num_images = len(roidb)
    processed_ims = []
    im_scales = []
    for i in xrange(num_images):
        im = cv2.imread(roidb[i]['image'])  # this is where the image is actually read from disk
        if roidb[i]['flipped']:
            im = im[:, ::-1, :]
        target_size = cfg.TRAIN.SCALES[scale_inds[i]]  # 600: target length of the shorter side
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images (the 'data' blob)
    blob = im_list_to_blob(processed_ims)

    return blob, im_scales
def prep_im_for_blob(im, pixel_means, target_size, max_size):
    """Mean subtract and scale an image for use in a blob."""
    im = im.astype(np.float32, copy=False)
    im -= pixel_means  # subtract the per-channel pixel means
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)  # scale the shorter side to target_size
    # Prevent the biggest axis from being more than MAX_SIZE
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)

    return im, im_scale
def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i in xrange(num_images):
        im = ims[i]
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    # Move channels (axis 3) to axis 1
    # Axis order will become: (batch elem, channel, height, width)
    channel_swap = (0, 3, 1, 2)
    blob = blob.transpose(channel_swap)
    return blob
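To make the scaling rule concrete: with SCALES = (600,) and MAX_SIZE = 1000 (the values reflected in the (1, 3, 600, 1000) data top above), a 375x500 VOC image is scaled by 600/375 = 1.6 to 600x800, and each image in a batch is then zero-padded up to the batch's maximum shape. A small sketch of the same rule (toy shapes are mine):

import numpy as np

def compute_scale(h, w, target_size=600, max_size=1000):
    # same rule as prep_im_for_blob: shorter side -> target_size,
    # capped so the longer side never exceeds max_size
    scale = float(target_size) / min(h, w)
    if round(scale * max(h, w)) > max_size:
        scale = float(max_size) / max(h, w)
    return scale

print compute_scale(375, 500)  # 1.6 -> resized to 600 x 800
print compute_scale(500, 353)  # ~1.7 -> 850 x 600
print compute_scale(300, 800)  # 2.0 capped to 1000/800 = 1.25 -> 375 x 1000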
The walkthrough above follows the code roughly in call order, which makes it easy to get lost in the jumps. In short, the overall flow is:
1. Read the data provided by PASCAL VOC into the roidb.
2. Add bbox_regression_targets to the roidb.
3. Start training; each iteration reads one mini-batch of data-layer blobs, i.e. the image data and the gt_boxes data.