TensorFlow: the gamma variable is not trained when building batch normalization


After training a GAN, reload the model and inspect the list of trainable variables.
tvars = tf.trainable_variables()
for i in tvars:
    print(i)


### output ###

# (only the beta variables of the batch-norm scopes appear in the list;
#  there is no gamma anywhere)

###

 
Strange. Under the '..._g' name scopes there really should be a gamma as well, so why does only beta show up?
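To see exactly what is missing, it helps to filter the trainable variables by name. A small check along these lines (the 'bn' substring matches the scope names used in this particular model; adjust it to your own scopes):

# quick check: group batch-norm variables by name (scope names are model-specific)
import tensorflow as tf

bn_vars = [v for v in tf.trainable_variables() if 'bn' in v.name]
print('beta  vars:', [v.name for v in bn_vars if 'beta' in v.name])
print('gamma vars:', [v.name for v in bn_vars if 'gamma' in v.name])  # empty here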
Let's go back to the original code.
# network.py 
g2 = tf.contrib.layers.batch_norm(g2, epsilon=1e-5, scope='bn2_g')


# where contrib.layers.batch_norm() is defined
# \lib\site-packages\tensorflow\contrib\layers.py

In the contrib layers.py file:
...
  # this is where the layer object gets created
  layer = normalization_layers.BatchNormalization(
	  axis=axis,
	  momentum=decay,
	  epsilon=epsilon,
	  center=center,
	  scale=scale,
	  beta_initializer=beta_initializer,
	  gamma_initializer=gamma_initializer,
	  moving_mean_initializer=moving_mean_initializer,
	  moving_variance_initializer=moving_variance_initializer,
	  beta_regularizer=beta_regularizer,
	  gamma_regularizer=gamma_regularizer,
	  trainable=trainable,
	  renorm=renorm,
	  renorm_clipping=renorm_clipping,
	  renorm_momentum=renorm_decay,
	  adjustment=adjustment,
	  name=sc.name,
	  _scope=sc,
	  _reuse=reuse,
	  fused=fused)
  outputs = layer.apply(inputs, training=is_training)
  print("==========i am here========") #    
  # Add variables to collections.
  _add_variable_to_collections(layer.moving_mean, variables_collections,
							   'moving_mean')
  _add_variable_to_collections(layer.moving_variance, variables_collections,
							   'moving_variance')
  if layer.beta is not None:
	_add_variable_to_collections(layer.beta, variables_collections, 'beta')
  if layer.gamma is not None:
	print("==========hello gama is added==============") #    
	_add_variable_to_collections(layer.gamma, variables_collections,
								 'gamma')
  else:
	  print("==========sorry layer.gama is none========") #    

This tells us that the root cause of gamma missing from the trainable variables lies in the creation of the layer object via normalization_layers.BatchNormalization().
# the corresponding import
from tensorflow.python.layers import normalization as normalization_layers

# the file lives at
\Lib\site-packages\tensorflow\python\layers\normalization.py

# normalization.py
@tf_export(v1=['layers.BatchNormalization'])
class BatchNormalization(keras_layers.BatchNormalization, base.Layer):

  def __init__(self,
               axis=-1,
               momentum=0.99,
               epsilon=1e-3,
               center=True,
               scale=True,
               beta_initializer=init_ops.zeros_initializer(),
               gamma_initializer=init_ops.ones_initializer(),
               moving_mean_initializer=init_ops.zeros_initializer(),
               moving_variance_initializer=init_ops.ones_initializer(),
               beta_regularizer=None,
               gamma_regularizer=None,
               beta_constraint=None,
               gamma_constraint=None,
               renorm=False,
               renorm_clipping=None,
               renorm_momentum=0.99,
               fused=None,
               trainable=True,
               virtual_batch_size=None,
               adjustment=None,
               name=None,
               **kwargs):
    super(BatchNormalization, self).__init__(
        axis=axis,
        momentum=momentum,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=beta_initializer,
        gamma_initializer=gamma_initializer,
        moving_mean_initializer=moving_mean_initializer,
        moving_variance_initializer=moving_variance_initializer,
        beta_regularizer=beta_regularizer,
        gamma_regularizer=gamma_regularizer,
        beta_constraint=beta_constraint,
        gamma_constraint=gamma_constraint,
        renorm=renorm,
        renorm_clipping=renorm_clipping,
        renorm_momentum=renorm_momentum,
        fused=fused,
        trainable=trainable,
        virtual_batch_size=virtual_batch_size,
        adjustment=adjustment,
        name=name,
        **kwargs)

It looks long, but the core is really just one line:
# the BatchNormalization constructor simply forwards everything
# to its parent class's constructor
super(BatchNormalization, self).__init__()

So we go looking for BatchNormalization's parent class.
Chasing dependencies in Python is genuinely painful: many of them cannot be resolved by the editor, so you cannot simply jump to the definition.
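One shortcut that avoids opening file after file is to print the class's method resolution order; Python already knows the whole inheritance chain. A small sketch, using the same import as in this post (TF 1.x module layout):

# print the inheritance chain instead of chasing files by hand
from tensorflow.python.layers import normalization as normalization_layers

for cls in normalization_layers.BatchNormalization.__mro__:
    print(cls.__module__, cls.__name__)
# this lists keras_layers.BatchNormalization and the base Layer class
# that the following sections walk through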
# from the class definition, the parent class is keras_layers.BatchNormalization
class BatchNormalization(keras_layers.BatchNormalization, base.Layer):

# the corresponding import
from tensorflow.python.keras import layers as keras_layers

# the keras layers live under
\Lib\site-packages\tensorflow\python\keras\layers\

# open the file where the class is defined
\Lib\site-packages\tensorflow\python\keras\layers\normalization.py

@tf_export('keras.layers.BatchNormalization', v1=[])
class BatchNormalizationV2(Layer):

  def __init__(self,
               axis=-1,
               momentum=0.99,
               epsilon=1e-3,
               center=True,
               scale=True,
               beta_initializer='zeros',
               gamma_initializer='ones',
               moving_mean_initializer='zeros',
               moving_variance_initializer='ones',
               beta_regularizer=None,
               gamma_regularizer=None,
               beta_constraint=None,
               gamma_constraint=None,
               renorm=False,
               renorm_clipping=None,
               renorm_momentum=0.99,
               fused=None,
               trainable=True,
               virtual_batch_size=None,
               adjustment=None,
               name=None,
               **kwargs):
    super(BatchNormalizationV2, self).__init__(
        name=name, trainable=trainable, **kwargs)

It turns out that the constructor of this BatchNormalizationV2() again just calls its parent class's constructor.
So we keep going.
# where the parent class is imported from
from tensorflow.python.keras.engine.base_layer import Layer

# located at
\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py


# the Layer class that BatchNormalizationV2 ultimately inherits from

@tf_export('keras.layers.Layer')
class Layer(checkpointable.CheckpointableBase):

Looking at Layer's constructor, we see that it has nothing to do with gamma at all:
  @checkpointable.no_automatic_dependency_tracking
  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
    # These properties should be set by the user via keyword arguments.
    # note that 'dtype', 'input_shape' and 'batch_input_shape'
    # are only applicable to input layers: do not pass these keywords
    # to non-input layers.
    allowed_kwargs = {
        'input_shape',
        'batch_input_shape',
        'batch_size',
        'weights',
        'activity_regularizer',
    }
    # Validate optional keyword arguments.
    for kwarg in kwargs:
      if kwarg not in allowed_kwargs:
        raise TypeError('Keyword argument not understood:', kwarg)

    # Mutable properties
    # Indicates whether the layer's weights are updated during training
    # and whether the layer's updates are run during training
    self.trainable = trainable
    # A stateful layer is a layer whose updates are run during inference too,
    # for instance stateful RNNs.
    self.stateful = False
    # Indicates whether `build` needs to be called upon layer call, to create
    # the layer's weights.
    self.built = False
    # Provides information about which inputs are compatible with the layer.
    self.input_spec = None
    self.supports_masking = False

    self._init_set_name(name)
    self._activity_regularizer = kwargs.pop('activity_regularizer', None)
    if not hasattr(self, '_trainable_weights'):
      self._trainable_weights = []
    if not hasattr(self, '_non_trainable_weights'):
      self._non_trainable_weights = []
    self._updates = []
    # A list of zero-argument lambdas which return Tensors, used for variable
    # regularizers.
    self._callable_losses = []
    # A list of symbolic Tensors containing activity regularizers and losses
    # manually added through `add_loss` in graph-building mode.
    self._losses = []
    # A list of loss values containing activity regularizers and losses
    # manually added through `add_loss` during eager execution. It is cleared
    # after every batch.
    # Because we plan on eventually allowing a same model instance to be trained
    # in eager mode or graph mode alternatively, we need to keep track of
    # eager losses and symbolic losses via separate attributes.
    self._eager_losses = []
    # A list of metric instances corresponding to the symbolic metric tensors
    # added using the `add_metric` API.
    self._metrics = []
    # TODO(psv): Remove this property.
    # A dictionary that maps metric names to metric result tensors. The results
    # are the running averages of metric values over an epoch.
    self._metrics_tensors = {}
    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
    self._call_fn_args = function_utils.fn_args(self.call)
    self._compute_previous_mask = ('mask' in self._call_fn_args or
                                   hasattr(self, 'compute_mask'))
    self._call_convention = (base_layer_utils
                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
    if not hasattr(self, '_layers'):
      self._layers = []  # Dependencies tracked via attribute assignment.

    # These lists will be filled via successive calls
    # to self._add_inbound_node().
    self._inbound_nodes = []
    self._outbound_nodes = []

    call_argspec = tf_inspect.getfullargspec(self.call)
    if 'training' in call_argspec.args:
      self._expects_training_arg = True
    else:
      self._expects_training_arg = False

    # Whether the `call` method can be used to build a TF graph without issues.
    self._call_is_graph_friendly = True

    # Manage input shape information if passed.
    if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
      # In this case we will later create an input layer
      # to insert before the current layer
      if 'batch_input_shape' in kwargs:
        batch_input_shape = tuple(kwargs['batch_input_shape'])
      elif 'input_shape' in kwargs:
        if 'batch_size' in kwargs:
          batch_size = kwargs['batch_size']
        else:
          batch_size = None
        batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
      self._batch_input_shape = batch_input_shape

    # Manage initial weight values if passed.
    if 'weights' in kwargs:
      self._initial_weights = kwargs['weights']
    else:
      self._initial_weights = None

 
So we go back to BatchNormalizationV2 once more...
 
# back to BatchNormalizationV2
# \Lib\site-packages\tensorflow\python\keras\layers\normalization.py

def __init__(self, ...):
    ...
    super(BatchNormalizationV2, self).__init__(
        name=name, trainable=trainable, **kwargs)
    # after the super() call, the gamma-related members are set up
    ...
    self.beta_initializer = initializers.get(beta_initializer)
    self.gamma_initializer = initializers.get(gamma_initializer)
    ...

Breaking down the main act, it is really one line:
self.gamma_initializer = initializers.get(gamma_initializer)

Note that this file is part of the Keras framework.
So we look for the definition of initializers.get() inside Keras.
# the definition of initializers.get()
#\Lib\site-packages\tensorflow\python\keras\initializers.py


@tf_export('keras.initializers.get')
def get(identifier):
  if identifier is None:
    return None
  if isinstance(identifier, dict):
    return deserialize(identifier)
  elif isinstance(identifier, six.string_types):
    config = {'class_name': str(identifier), 'config': {}}
    return deserialize(config)
  elif callable(identifier):
    return identifier
  else:
    raise ValueError('Could not interpret initializer identifier: ' +
                     str(identifier))

Looking at this definition, the only way gamma could be affected here is if gamma_initializer were None, in which case get() would return None.
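Before instrumenting the TensorFlow source, that hypothesis can be sanity-checked in isolation (a small sketch; the keras initializers module is the one referenced above):

# what initializers.get() returns for the two interesting cases
from tensorflow.python.keras import initializers

print(initializers.get('ones'))  # the default 'ones' -> an Ones initializer instance
print(initializers.get(None))    # None -> None, which is what we suspect might be happening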
 
# test (added inside __init__)
    self.gamma_initializer = initializers.get(gamma_initializer)
    if self.gamma_initializer is None:
        print("after get , gama_initializer is None")
    else:
        print("gama_ini is " + str(self.gamma_initializer))

# output
gama_ini is 

But according to the test, gamma_initializer is perfectly fine...
 
So we go back to where we started.
#layers.py
# right after creating the layer object, check whether it already has a gamma

layer = normalization_layers.BatchNormalization(...)

if layer.gamma is None:
          print("==========sorry layer.gama is none========")

print("going to make layer.apply")
outputs = layer.apply(inputs, training=is_training)



# output
'BatchNormalization' object has no attribute 'gamma'

# right after the layer object is created, it has no gamma attribute at all
# (the same check shows that beta does not exist at this point either)
# so gamma must be created later, inside layer.apply()
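This lazy creation is easy to reproduce in isolation (a minimal sketch in TF 1.x graph mode; the input shape is made up):

# minimal reproduction: gamma only exists after the layer has been applied
import tensorflow as tf
from tensorflow.python.layers import normalization as normalization_layers

x = tf.placeholder(tf.float32, [None, 8, 8, 64])     # hypothetical input
layer = normalization_layers.BatchNormalization(epsilon=1e-5)

print(hasattr(layer, 'gamma'))    # False: no gamma attribute yet
outputs = layer.apply(x, training=True)
print(layer.gamma)                # now an actual tf.Variable named '<scope>/gamma:0'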

 
To understand layer.apply(), we keep digging.
# layers.py
# the layer is applied here to produce the outputs
outputs = layer.apply(inputs, training=is_training)


# apply() is defined on the base Layer class
# \Lib\site-packages\tensorflow\python\keras\engine\base_layer.py
# class Layer

def apply(self, inputs, *args, **kwargs):
    """Apply the layer on a input.

    This is an alias of `self.__call__`.

    Arguments:
      inputs: Input tensor(s).
      *args: additional positional arguments to be passed to `self.call`.
      **kwargs: additional keyword arguments to be passed to `self.call`.

    Returns:
      Output tensor(s).
    """
    return self.__call__(inputs, *args, **kwargs)


# apply() simply forwards to self.__call__()

def __call__(self, inputs, *args, **kwargs):
    """
        ,     base_layer.py   。
               ,           call()。
    call      ,      "    "。
      call()           ,call() return  output
       BN   ,     BN  output。
      BN                。
    """
    # flatten the inputs into a list
    input_list = nest.flatten(inputs)

    if context.executing_eagerly():
      # Accept NumPy inputs by converting to Tensors when executing eagerly.
      if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
        input_list = nest.flatten(inputs)

    # check whether every input is a symbolic TF tensor, i.e. whether we are building a graph
    build_graph = tf_utils.are_all_symbolic_tensors(input_list)
    executing_eagerly = context.executing_eagerly()

    previous_mask = None
    if build_graph and (not hasattr(self, '_compute_previous_mask') or
                        self._compute_previous_mask):
      previous_mask = base_layer_utils.collect_previous_mask(inputs)
      if not hasattr(self, '_call_fn_args'):
        self._call_fn_args = function_utils.fn_args(self.call)
      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
          not generic_utils.is_all_none(previous_mask)):
        # The previous layer generated a mask, and mask was not explicitly pass
        # to __call__, hence we set previous_mask as the default value.
        kwargs['mask'] = previous_mask

    input_shapes = None

    with ops.name_scope(self._name_scope()):
"""
      #   
"""
      if not self.built:
        # Build layer if applicable (if the `build` method has been overridden).
"""
        #                 build()
"""
        self._maybe_build(inputs)

        # We must set self.built since user defined build functions are not
        # constrained to set self.built.

        # note: self.built is set here as well, so a custom build() does not have to set it
        self.built = True  

      # Check input assumptions set after layer building, e.g. input shape.
      if build_graph:
        # Symbolic execution on symbolic tensors. We will attempt to build
        # the corresponding TF subgraph inside `backend.get_graph()`
        input_spec.assert_input_compatibility(
            self.input_spec, inputs, self.name)
        graph = backend.get_graph()
"""
        #  
"""
        with graph.as_default():
          if not executing_eagerly:
            # In graph mode, failure to build the layer's graph
            # implies a user-side bug. We don't catch exceptions.
            outputs = self.call(inputs, *args, **kwargs)
"""
            #  __call__              call      output
"""
          else:
            try:
              outputs = self.call(inputs, *args, **kwargs)
"""
              #  。         
"""
            except Exception:  # pylint: disable=broad-except
              # Any issue during graph-building means we will later run the
              # model in eager mode, whether the issue was related to
              # graph mode or not. This provides a nice debugging experience.
              self._call_is_graph_friendly = False
              # We will use static shape inference to return symbolic tensors
              # matching the specifications of the layer outputs.
              # Since we have set `self._call_is_graph_friendly = False`,
              # we will never attempt to run the underlying TF graph (which is
              # disconnected).
              # TODO(fchollet): consider py_func as an alternative, which
              # would enable us to run the underlying graph if needed.
              input_shapes = nest.map_structure(lambda x: x.shape, inputs)
              output_shapes = self.compute_output_shape(input_shapes)
              outputs = nest.map_structure(
                  lambda shape: backend.placeholder(shape, dtype=self.dtype),
                  output_shapes)

          if outputs is None:
            raise ValueError('A layer\'s `call` method should return a '
                             'Tensor or a list of Tensors, not None '
                             '(layer: ' + self.name + ').')
          self._handle_activity_regularization(inputs, outputs)
          self._set_mask_metadata(inputs, outputs, previous_mask)
          if base_layer_utils.have_all_keras_metadata(inputs):
            inputs, outputs = self._set_connectivity_metadata_(
                inputs, outputs, args, kwargs)
          if hasattr(self, '_set_inputs') and not self.inputs:
            # Subclassed network: explicitly set metadata normally set by
            # a call to self._set_inputs().
            # This is not relevant in eager execution.
            self._set_inputs(inputs, outputs)
      else:
        # Eager execution on data tensors.
        outputs = self.call(inputs, *args, **kwargs)
        self._handle_activity_regularization(inputs, outputs)
        return outputs

    if not context.executing_eagerly():
      # Optionally load weight values specified at layer instantiation.
      # TODO(fchollet): consider enabling this with eager execution too.
      if (hasattr(self, '_initial_weights') and
          self._initial_weights is not None):
        self.set_weights(self._initial_weights)
        del self._initial_weights
    return outputs

 
Now it is clear why, back in layers.py, the freshly created layer had no beta and no gamma.
A freshly created layer has only run the constructor of the base Layer class in base_layer.py, and that constructor is, as we saw, very simple.
When base_layer was written, it obviously could not know whether a BN layer, a convolution layer, or some other kind of layer would later be built on top of it.
How could it possibly know that it needs to create gamma and beta?
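This deferred creation is exactly the Keras build() pattern: a layer only creates its weights once it has seen an input and knows its shape. A minimal custom-layer sketch of the same mechanism (the ScaleOnly layer below is invented purely for illustration):

# illustration of the build()/add_weight pattern (hypothetical layer)
import tensorflow as tf

class ScaleOnly(tf.keras.layers.Layer):
    def build(self, input_shape):
        # the weight is created only here, once the input shape is known
        self.gamma = self.add_weight(name='gamma',
                                     shape=(int(input_shape[-1]),),
                                     initializer='ones',
                                     trainable=True)
        super(ScaleOnly, self).build(input_shape)

    def call(self, inputs):
        return inputs * self.gamma

layer = ScaleOnly()
print(layer.weights)              # [] -- nothing has been created yet
_ = layer(tf.ones([2, 4]))        # __call__ triggers build(), then call()
print([w.name for w in layer.weights])   # something like ['scale_only/gamma:0']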
 
So the creation of the two BN-specific parameters, gamma and beta, clearly has to happen in a class that is directly tied to the BN layer.
Following this line of thought, the __call__() above touches two functions that subclasses are meant to provide:
one is build(), the other is call().
Let's look at call() first.
# back in the keras version of normalization.py
# \Lib\site-packages\tensorflow\python\keras\layers\normalization.py

# subclass of Layer
class BatchNormalizationV2(Layer):

  def call(self, inputs, training=None):
    if training is None:
      training = K.learning_phase()

    in_eager_mode = context.executing_eagerly()
    # virtual_batch_size splits the real batch into "ghost" batches before BN
    if self.virtual_batch_size is not None:
      # Virtual batches (aka ghost batches) can be simulated by reshaping the
      # Tensor and reusing the existing batch norm implementation
      original_shape = [-1] + inputs.shape.as_list()[1:]
      # >> [-1, dim1, dim2, dim3, ...]
      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
      # >> [v_b_size, -1, dim1, dim2, dim3, ...]

      # Will cause errors if virtual_batch_size does not divide the batch size
      # [b_size, dim1, dim2, ...] is reshaped to [v_b_size, -1, dim1, dim2, dim3]
      inputs = array_ops.reshape(inputs, expanded_shape)

      def undo_virtual_batching(outputs):
        outputs = array_ops.reshape(outputs, original_shape)
        return outputs

    # fused == True: the faster, fused implementation computes the output
    # and returns from call() right here
    if self.fused:
      outputs = self._fused_batch_norm(inputs, training=training)
      if self.virtual_batch_size is not None:
        # Currently never reaches here since fused_batch_norm does not support
        # virtual batching
        outputs = undo_virtual_batching(outputs)
      return outputs

    # fused == False or None: the non-fused implementation
    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.get_shape()
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
      del reduction_axes[1]     # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value
    def _broadcast(v):
      if (v is not None and
          len(v.get_shape()) != ndims and
          reduction_axes != list(range(ndims - 1))):
        return array_ops.reshape(v, broadcast_shape)
      return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
      if then_scale is not None:
        scale *= then_scale
        offset *= then_scale
      if then_offset is not None:
        offset += then_offset
      return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = tf_utils.constant_value(training)
    if training_value is not False:
      if self.adjustment:
        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
        # Adjust only during training.
        adj_scale = tf_utils.smart_cond(training,
                                        lambda: adj_scale,
                                        lambda: array_ops.ones_like(adj_scale))
        adj_bias = tf_utils.smart_cond(training,
                                       lambda: adj_bias,
                                       lambda: array_ops.zeros_like(adj_bias))
        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

      # Some of the computations here are not necessary when training==False
      # but not a constant. However, this makes the code simpler.
      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
      mean, variance = self._moments(
          inputs, reduction_axes, keep_dims=keep_dims)

      moving_mean = self.moving_mean
      moving_variance = self.moving_variance

      mean = tf_utils.smart_cond(training,
                                 lambda: mean,
                                 lambda: moving_mean)
      variance = tf_utils.smart_cond(training,
                                     lambda: variance,
                                     lambda: moving_variance)

      if self.virtual_batch_size is not None:
        # This isn't strictly correct since in ghost batch norm, you are
        # supposed to sequentially update the moving_mean and moving_variance
        # with each sub-batch. However, since the moving statistics are only
        # used during evaluation, it is more efficient to just update in one
        # step and should not make a significant difference in the result.
        new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
        new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)
      else:
        new_mean, new_variance = mean, variance

      if self.renorm:
        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
            new_mean, new_variance, training)
        # When training, the normalized values (say, x) will be transformed as
        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
        # = x * (r * gamma) + (d * gamma + beta) with renorm.
        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
        scale, offset = _compose_transforms(r, d, scale, offset)

      def _do_update(var, value):
        if in_eager_mode and not self.trainable:
          return
        return self._assign_moving_average(var, value, self.momentum)

      mean_update = tf_utils.smart_cond(
          training,
          lambda: _do_update(self.moving_mean, new_mean),
          lambda: self.moving_mean)
      variance_update = tf_utils.smart_cond(
          training,
          lambda: _do_update(self.moving_variance, new_variance),
          lambda: self.moving_variance)
      if not context.executing_eagerly():
        self.add_update(mean_update, inputs=True)
        self.add_update(variance_update, inputs=True)

    else:  # training_value is False
      mean, variance = self.moving_mean, self.moving_variance

    mean = math_ops.cast(mean, inputs.dtype)
    variance = math_ops.cast(variance, inputs.dtype)
    if offset is not None:
      offset = math_ops.cast(offset, inputs.dtype)
    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
      outputs = undo_virtual_batching(outputs)
    return outputs

keras->BatchNormalizationV 2クラスのcall()関数はfusedパラメータに関係し,分岐選択が可能であることが分かった.
しかし残念ながらcall()は私たちが研究するgammaとは連絡がありません.
 
So we turn our attention to build().
# in the same file, build() sits right next to call()
# \Lib\site-packages\tensorflow\python\keras\layers\normalization.py

# subclass of Layer
class BatchNormalizationV2(Layer):

  def build(self, input_shape):
    input_shape = tensor_shape.TensorShape(input_shape)
    if not input_shape.ndims:
      raise ValueError('Input has undefined rank:', input_shape)
    ndims = len(input_shape)

    # Convert axis to list and resolve negatives
    if isinstance(self.axis, int):
      self.axis = [self.axis]
    for idx, x in enumerate(self.axis):
      if x < 0:
        self.axis[idx] = ndims + x

    # Validate axes
    for x in self.axis:
      if x < 0 or x >= ndims:
        raise ValueError('Invalid axis: %d' % x)
    if len(self.axis) != len(set(self.axis)):
      raise ValueError('Duplicate axis: %s' % self.axis)

    if self.virtual_batch_size is not None:
      if self.virtual_batch_size <= 0:
        raise ValueError('virtual_batch_size must be a positive integer that '
                         'divides the true batch size of the input Tensor')
      # If using virtual batches, the first dimension must be the batch
      # dimension and cannot be the batch norm axis
      if 0 in self.axis:
        raise ValueError('When using virtual_batch_size, the batch dimension '
                         'must be 0 and thus axis cannot include 0')
      if self.adjustment is not None:
        raise ValueError('When using virtual_batch_size, adjustment cannot '
                         'be specified')

    if self.fused in (None, True):
      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
      # output back to its original shape accordingly.
      if self._USE_V2_BEHAVIOR:
        if self.fused is None:
          self.fused = (ndims == 4)
        elif self.fused and ndims != 4:
          raise ValueError('Batch normalization layers with fused=True only '
                           'support 4D input tensors.')
      else:
        assert self.fused is not None
        self.fused = (ndims == 4 and self._fused_can_be_used())
      # TODO(chrisying): fused batch norm is currently not supported for
      # multi-axis batch norm and by extension virtual batches. In some cases,
      # it might be possible to use fused batch norm but would require reshaping
      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
      # particularly tricky. A compromise might be to just support the most
      # common use case (turning 5D w/ virtual batch to NCHW)

    if self.fused:
      if self.axis == [1]:
        self._data_format = 'NCHW'
      elif self.axis == [3]:
        self._data_format = 'NHWC'
      else:
        raise ValueError('Unsupported axis, fused batch norm only supports '
                         'axis == [1] or axis == [3]')

    # Raise parameters of fp16 batch norm to fp32
    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
      param_dtype = dtypes.float32
    else:
      param_dtype = self.dtype or dtypes.float32

    axis_to_dim = {x: input_shape.dims[x].value for x in self.axis}
    for x in axis_to_dim:
      if axis_to_dim[x] is None:
        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
                         input_shape)
    self.input_spec = InputSpec(ndim=ndims, axes=axis_to_dim)

    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
      # Single axis batch norm (most common/default use-case)
      param_shape = (list(axis_to_dim.values())[0],)
    else:
      # Parameter shape is the original shape but with 1 in all non-axis dims
      param_shape = [axis_to_dim[i] if i in axis_to_dim
                     else 1 for i in range(ndims)]
      if self.virtual_batch_size is not None:
        # When using virtual batches, add an extra dim at index 1
        param_shape.insert(1, 1)
        for idx, x in enumerate(self.axis):
          self.axis[idx] = x + 1      # Account for added dimension

    # --- the key part ---
    if self.scale:
      # if scale is True, gamma is created as a trainable weight
      self.gamma = self.add_weight(
          name='gamma',
          shape=param_shape,
          dtype=param_dtype,
          initializer=self.gamma_initializer,
          regularizer=self.gamma_regularizer,
          constraint=self.gamma_constraint,
          trainable=True)
    else:
      # if scale is False, gamma is simply None; the fused path uses a
      # constant 1.0 instead, so no gamma variable is ever created and
      # nothing gets added to trainable_variables
      self.gamma = None
      if self.fused:
        self._gamma_const = array_ops.constant(
            1.0, dtype=param_dtype, shape=param_shape)

    if self.center:
      self.beta = self.add_weight(
          name='beta',
          shape=param_shape,
          dtype=param_dtype,
          initializer=self.beta_initializer,
          regularizer=self.beta_regularizer,
          constraint=self.beta_constraint,
          trainable=True)
    else:
      self.beta = None
      if self.fused:
        self._beta_const = array_ops.constant(
            0.0, dtype=param_dtype, shape=param_shape)

    try:
      # Disable variable partitioning when creating the moving mean and variance
      if hasattr(self, '_scope') and self._scope:
        partitioner = self._scope.partitioner
        self._scope.set_partitioner(None)
      else:
        partitioner = None
      self.moving_mean = self.add_weight(
          name='moving_mean',
          shape=param_shape,
          dtype=param_dtype,
          initializer=self.moving_mean_initializer,
          synchronization=tf_variables.VariableSynchronization.ON_READ,
          trainable=False,
          aggregation=tf_variables.VariableAggregation.MEAN)

      self.moving_variance = self.add_weight(
          name='moving_variance',
          shape=param_shape,
          dtype=param_dtype,
          initializer=self.moving_variance_initializer,
          synchronization=tf_variables.VariableSynchronization.ON_READ,
          trainable=False,
          aggregation=tf_variables.VariableAggregation.MEAN)

      if self.renorm:
        # Create variables to maintain the moving mean and standard deviation.
        # These are used in training and thus are different from the moving
        # averages above. The renorm variables are colocated with moving_mean
        # and moving_variance.
        # NOTE: below, the outer `with device` block causes the current device
        # stack to be cleared. The nested ones use a `lambda` to set the desired
        # device and ignore any devices that may be set by the custom getter.
        def _renorm_variable(name, shape):
          var = self.add_weight(
              name=name,
              shape=shape,
              dtype=param_dtype,
              initializer=init_ops.zeros_initializer(),
              synchronization=tf_variables.VariableSynchronization.ON_READ,
              trainable=False,
              aggregation=tf_variables.VariableAggregation.MEAN)
          return var
        with distribution_strategy_context.get_distribution_strategy(
        ).colocate_vars_with(self.moving_mean):
          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
        # renorm_stddev_weight. This allows us to (1) mix the average
        # stddev with the minibatch stddev early in training, and (2) compute
        # the unbiased average stddev by dividing renorm_stddev by the weight.
        with distribution_strategy_context.get_distribution_strategy(
        ).colocate_vars_with(self.moving_variance):
          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
                                                       ())
    finally:
      if partitioner:
        self._scope.set_partitioner(partitioner)
    self.built = True

With that we can conclude that gamma and beta are defined in build():
gamma is controlled by the self.scale parameter, beta by the self.center parameter.
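This is easy to confirm directly with the public Keras layer (a small sketch; the input shape is made up, TF 1.x API):

# confirm: `scale` controls gamma, `center` controls beta
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8, 8, 64])   # hypothetical input
bn = tf.keras.layers.BatchNormalization(scale=False, center=True)
_ = bn(x, training=True)
print([w.name for w in bn.trainable_weights])
# only a '.../beta:0' entry shows up, no gamma -- exactly the symptom above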
 
But we still want to know why gamma silently disappeared while beta was trained as usual.
The parameter documentation of class BatchNormalizationV2(Layer) says:
center: If True, add offset of `beta` to normalized tensor.
  If False, `beta` is ignored.
scale: If True, multiply by `gamma`.
  If False, `gamma` is not used.
  When the next layer is linear (also e.g. `nn.relu`),
  this can be disabled since the scaling
  will be done by the next layer.
Reading this description, it suddenly made sense: in the GAN model I am using, every generator layer is activated with leakyrelu() after its convolution.
 
I never set the scale parameter myself, and the default arguments of batch_norm() are:
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               ...)

As a result, beta (controlled by center) took effect, while gamma (controlled by scale) was ignored.
 
So fix the places where batch_norm() is called:
g1 = tf.contrib.layers.batch_norm(g1, epsilon=1e-5, scope='bn1_g')

# change it to

g1 = tf.contrib.layers.batch_norm(g1, scale=True, epsilon=1e-5, scope='bn1_g')

This ensures that gamma is also added to the list of trainable variables.
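After rebuilding the graph with scale=True, the check from the beginning of this post should now list the gamma variables as well (the exact scope names depend on how the network is defined):

# re-run the original check: gamma should now appear under the bn*_g scopes
import tensorflow as tf

gammas = [v for v in tf.trainable_variables() if 'gamma' in v.name]
for v in gammas:
    print(v.name, v.shape)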