TensorFlow 2.0 Notes (6) — Overfitting and Underfitting


Mismatch: ground-truth VS estimated
  • model capacity: $y=\beta_0+\beta_1 x+\beta_2 x^2+\beta_3 x^3+\dots+\beta_n x^n+\epsilon$
  • under-fitting: Estimated < ground-truth
  • train acc. is bad
  • test acc. is bad as well

  • over-fitting: Estimated > ground-truth
  • train loss and acc. is much better
  • test acc. is worse
  • => Generalization Performance


  • Detect over-fitting: split the dataset into a Train Set, Val Set, and Test Set
    # Two alternative ways to carve a validation set out of Fashion-MNIST.
    # Option 1: tensorflow_datasets — take/skip on the 'train' split.
    dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
    train_dataset, test_data = dataset['train'], dataset['test']

    num_val_examples = 10000
    num_train_examples = metadata.splits['train'].num_examples - num_val_examples
    num_test_examples = metadata.splits['test'].num_examples

    # First num_train_examples examples train; the remaining 10k validate.
    train_data = train_dataset.take(num_train_examples)
    val_data = train_dataset.skip(num_train_examples).take(num_val_examples)

    # Option 2: keras datasets — shuffle indices once, gather a 50k/10k split.
    (x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

    idx = tf.range(60000)
    idx = tf.random.shuffle(idx)
    x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
    x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])

    # NOTE(review): these re-bind train_data/val_data/test_data, replacing the
    # tfds pipelines built above — the two options are alternatives, not steps.
    train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    

    Train/test trade-off → K-fold cross-validation
  • merge train/val sets
  • randomly sample 1/k as val set
  • model.fit(train_data, epochs=10, validation_split=0.1, validation_freq=1)
    

    Reduce Overfitting
  • More data
  • Constraint model complexity
  • shallow
  • regularization

  • Dropout
  • Data augmentation
  • Early Stopping
  • import os
    import tensorflow as tf
    
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    
    (x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
    
    
    idx = tf.range(60000)
    idx = tf.random.shuffle(idx)
    x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
    x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])
    
    train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    train_val_data = zip(tf.convert_to_tensor(x),tf.convert_to_tensor(y))
    
    
    def normalize(images, labels):
        """Scale pixels to [0, 1], flatten to a 784-vector, one-hot the label."""
        pixels = tf.cast(images, tf.float32) / 255.
        flat = tf.reshape(pixels, [28 * 28])
        return flat, tf.one_hot(labels, depth=10)
    
    
    # Batch and prefetch; each shuffle buffer covers its whole split.
    BATCH_SIZE = 100
    train_data = train_data.map(normalize).shuffle(50000).batch(BATCH_SIZE)\
        .prefetch(tf.data.experimental.AUTOTUNE)
    val_data = val_data.map(normalize).shuffle(10000).batch(BATCH_SIZE)\
        .prefetch(tf.data.experimental.AUTOTUNE)
    # No shuffle for test data — evaluation order does not matter.
    test_data = test_data.map(normalize).batch(BATCH_SIZE)
    
    
    class MyDense(tf.keras.layers.Layer):
        """Minimal fully connected layer: out = inputs @ w + b (no activation)."""

        def __init__(self, inp_dim, outp_dim):
            super(MyDense, self).__init__()

            # Fix: Layer.add_variable() was deprecated and removed in TF 2.x;
            # add_weight() is the supported API with the same arguments.
            self.kernel = self.add_weight('w', [inp_dim, outp_dim])
            self.bias = self.add_weight('b', [outp_dim])

        def call(self, inputs, training=None):
            # Plain affine transform; `training` is accepted for Keras API
            # compatibility but unused (the layer has no train-time behavior).
            out = inputs @ self.kernel + self.bias

            return out
    
    
    class MyModel(tf.keras.Model):
        """Five-layer MLP: 784 -> 256 -> 128 -> 64 -> 32 -> 10 logits."""

        def __init__(self):
            super(MyModel, self).__init__()
            self.fc1 = MyDense(28*28, 256)
            self.fc2 = MyDense(256, 128)
            self.fc3 = MyDense(128, 64)
            self.fc4 = MyDense(64, 32)
            self.fc5 = MyDense(32, 10)

        def call(self, inputs, training=None, mask=None):
            # ReLU after every hidden layer; the last layer emits raw logits
            # (pair with a from_logits=True loss).
            hidden = inputs
            for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
                hidden = tf.nn.relu(layer(hidden))
            return self.fc5(hidden)
    
    
    model = MyModel()
    # Fix: input_shape must be a flat shape, not a nested [None, [784]] list.
    model.build(input_shape=[None, 28*28])
    model.summary()
    # Fix: `lr` is a deprecated alias; use `learning_rate`.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                  loss=tf.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    # Fix: the original fit() referenced undefined train_val_x/train_val_y,
    # and validation_split cannot slice a tf.data.Dataset anyway — validate
    # against the explicit val split built above.
    model.fit(train_data, epochs=5, validation_data=val_data, validation_freq=1)

    model.evaluate(test_data)
    

    Regularization: add a penalty on parameter magnitude to the training loss.
  • L1-regularization: $J(\theta)=-\frac{1}{m}\sum_{i=1}^{m}\left[y_i\ln\hat{y}_i+(1-y_i)\ln(1-\hat{y}_i)\right]+\lambda\sum_{i=1}^{n}|\theta_i|$
  • L2-regularization: $J(W;X,y)+\frac{1}{2}\lambda\,\lVert W\rVert^2$
  • l2_model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.001), 
                              activation=tf.nn.relu,
                              input_shape=([None, 28*28])),
        tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.001), 
                              activation=tf.nn.relu),
        tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
    ])
    
    # Custom training step adding an explicit L2 weight penalty to the loss.
    # (`model`, `optimizer`, `train_data` come from the surrounding notes.)
    for step, (x,y) in enumerate(train_data):
        with tf.GradientTape() as tape:
            # Fix: the original referenced undefined `out`/`y_onehot` — compute
            # them from the batch. Assumes `y` holds integer class ids and the
            # model emits raw logits — TODO confirm against the caller.
            out = model(x)
            y_onehot = tf.one_hot(y, depth=10)
            # Fix: `true` is not a Python literal; the flag must be True.
            loss = tf.reduce_mean(tf.losses.categorical_crossentropy(y_onehot, out, from_logits=True))
            loss_regularization = []
            for p in model.trainable_variables:
                loss_regularization.append(tf.nn.l2_loss(p))
            # Sum the squared-L2 norms of every trainable tensor.
            loss_regularization = tf.reduce_sum(tf.stack(loss_regularization))

            # Small coefficient keeps the penalty from dominating the CE loss.
            loss = loss + 0.0001 * loss_regularization

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    

    momentum: plain SGD is $w^{k+1}=w^k-\alpha\nabla f(w^k)$; with momentum, $z^{k+1}=\beta z^k+\nabla f(w^k)$ and $w^{k+1}=w^k-\alpha z^{k+1}$
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.02, momentum=0.9)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.02, momentum=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.02, beta_1=0.9, beta_2=0.999)
    
    

    learning rate decay
    # Manually anneal the step size once per epoch (simple linear schedule).
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.2)
    for epoch in range(100):
        # get loss (forward pass + loss computation goes here)
        
        # change learning rate: linear decay from 0.2 toward 0 over 100 epochs
        optimizer.learning_rate = 0.2 *(100 - epoch) / 100
        
        # update weights (gradient step goes here)
    

    Early Stopping
  • Validation set to select parameters
  • Monitor validation performance
  • Stop at the highest val perf

  • Dropout
  • Learning less to learn better
  • Each connection is dropped with probability $p \in [0, 1]$
  • model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation=tf.nn.relu),
        tf.keras.layers.Dense(32, activation=tf.nn.relu),
        tf.keras.layers.Dense(10)
    ])
    # Dropout behaves differently at train vs inference time, so a custom
    # loop must pass the `training` flag explicitly on every forward pass.
    for step, (x,y) in enumerate(train_data):
        with tf.GradientTape() as tape:
            x = tf.reshape(x, (-1, 28*28))
            # train: training=True randomly drops connections
            out = model(x, training=True)
            # val out = model(x, training=False)
        # test: training=False keeps all connections (dropout disabled)
        out = model(x, training=False)
    

    Stochastic Gradient Descent
  • Stochastic
  • not random!

  • Deterministic
  • Not single usually
  • batch = 16, 32, 64, 128…
  • import os
    import tensorflow as tf


    # Silence TF's C++ INFO/WARNING log spam.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # x/y: 60k train images and labels; x_test/y_test: 10k test examples.
    (x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
    
    
    def normalize(images, labels):
        """Scale pixel values to [0, 1] and flatten each image to 784 floats."""
        scaled = tf.cast(images, tf.float32) / 255.
        return tf.reshape(scaled, [28 * 28]), labels
    
    
    # 784 -> 256 -> 128 -> 64 -> 32 -> 10 MLP; last layer outputs raw logits.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(64, activation=tf.nn.relu),
        tf.keras.layers.Dense(32, activation=tf.nn.relu),
        tf.keras.layers.Dense(10)
    ])

    model.build(input_shape=[None, 28*28])
    model.summary()
    # Fix: `lr` is a deprecated alias removed in newer TF releases.
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    # Streaming metrics; reset after each report below.
    acc_meter = tf.keras.metrics.Accuracy()
    loss_meter = tf.keras.metrics.Mean()
    test_meter = tf.keras.metrics.Accuracy()

    BATCH_SIZE = 128
    # The test pipeline is fixed across epochs, so build it once up front.
    test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    test_data = test_data.map(normalize).batch(BATCH_SIZE)
    
    for epoch in range(5):
        # Re-draw the 50k/10k train/val split at the start of every epoch.
        idx = tf.range(60000)
        idx = tf.random.shuffle(idx)
        x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
        x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])

        # Rebuild the input pipelines over the fresh split.
        train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        train_data = train_data.map(normalize).shuffle(50000).batch(BATCH_SIZE)

        val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
        val_data = val_data.map(normalize).shuffle(10000).batch(BATCH_SIZE)

        # Fix: the original reused x_train/y_train, x_val/y_val and the source
        # x_test/y_test arrays as loop variables, clobbering them with the last
        # batch; distinct batch names keep the originals intact.
        for step, (x_batch, y_batch) in enumerate(train_data):
            with tf.GradientTape() as tape:
                y_one = tf.one_hot(y_batch, depth=10)
                logits = model(x_batch)
                loss_ce = tf.reduce_mean(tf.losses.categorical_crossentropy(y_one, logits, from_logits=True))
                loss_meter.update_state(loss_ce)

            grads = tape.gradient(loss_ce, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Every 10 steps, report mean train loss and full-val accuracy.
            if step % 10 == 0:
                for x_vb, y_vb in val_data:
                    logits = model(x_vb)
                    prob = tf.nn.softmax(logits, axis=1)
                    pred = tf.argmax(prob, axis=1)
                    acc_meter.update_state(y_vb, pred)

                print(epoch, step, 'loss:', loss_meter.result().numpy(),
                      'Evaluate Acc:', acc_meter.result().numpy())
                loss_meter.reset_states()
                acc_meter.reset_states()

        # End-of-epoch accuracy on the held-out test set.
        for x_tb, y_tb in test_data:
            logits = model(x_tb)
            prob = tf.nn.softmax(logits, axis=1)
            pred = tf.argmax(prob, axis=1)
            test_meter.update_state(y_tb, pred)
        print(epoch, 'Test Acc:', test_meter.result().numpy())
        test_meter.reset_states()