TensorFlow 2.0ノート(六)——オーバーフィットとアンダーフィット
61182 ワード
Mismatch: ground-truth vs. estimated model capacity. Model family: $y=\beta_0+\beta_1 x+\beta_2 x^2+\beta_3 x^3+\dots+\beta_n x^n+\epsilon$. Under-fitting (estimated capacity < ground-truth): train accuracy is bad and test accuracy is bad as well.
Over-fitting (estimated capacity > ground-truth): train loss and accuracy look much better, but test accuracy is worse ⇒ poor generalization performance.
Detect over-fitting split dataset: Train Set, Val Set, Test Set
Train/test trade-off: with K-fold cross-validation, merge the train/val sets and randomly sample 1/k of the data as the validation set for each fold.
Reduce overfitting: more data; constrain model complexity (shallower model, regularization); Dropout; data augmentation; early stopping.
Regularization. L1-regularization adds an $\ell_1$ penalty to the cross-entropy loss: $J(\theta)=-\frac{1}{m}\sum_{i=1}^m\left[y_i\ln\hat{y}_i+(1-y_i)\ln(1-\hat{y}_i)\right]+\lambda\sum_{i=1}^n|\theta_i|$. L2-regularization adds a squared-norm penalty instead: $J(W;X,y)+\frac{1}{2}\lambda\lVert W\rVert^2$.
Momentum. Plain gradient descent: $w^{k+1}=w^k-\alpha\nabla f(w^k)$. With momentum: $z^{k+1}=\beta z^k+\nabla f(w^k)$, then $w^{k+1}=w^k-\alpha z^{k+1}$.
learning rate decay
Early Stopping Validation set to select parameters Monitor validation performance Stop at the highest val perf
Dropout — "learning less to learn better": during training each connection is dropped with probability $p\in[0,1]$.
Stochastic Gradient Descent: "stochastic" refers to sampling mini-batches, not to random updates — in contrast to deterministic (full-batch) gradient descent. It does not mean a single example per step; usually batch = 16, 32, 64, 128, …
Detect over-fitting split dataset: Train Set, Val Set, Test Set
# Load Fashion-MNIST through TensorFlow Datasets; with_info=True also returns
# split metadata (example counts) alongside the datasets.
dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
train_dataset, test_data = dataset['train'], dataset['test']
# Reserve 10k of the 60k training examples for validation.
num_val_examples = 10000
num_train_examples = metadata.splits['train'].num_examples - num_val_examples
num_test_examples = metadata.splits['test'].num_examples
# First num_train_examples go to training, the remaining 10k to validation.
train_data = train_dataset.take(num_train_examples)
val_data = train_dataset.skip(num_train_examples).take(num_val_examples)
# Alternative: load raw tensors with keras.datasets and split manually.
(x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
# Shuffle the 60k indices so the 50k/10k train/val split is random.
idx = tf.range(60000)
idx = tf.random.shuffle(idx)
x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
train test trade-off K-fold cross-validation
# NOTE(review): validation_split is only supported for tensor/NumPy inputs;
# when `train_data` is a tf.data.Dataset (as in the snippets above) Keras
# rejects it -- pass validation_data=... instead. Kept as-is from the note.
model.fit(train_data, epochs=10, validation_split=0.1, validation_freq=1)
Reduce Overfitting
import os
import tensorflow as tf

# Silence TF C++ info/warning logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Full 60k Fashion-MNIST training split plus the 10k test split.
(x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Randomly carve the 60k training images into 50k train / 10k validation.
idx = tf.range(60000)
idx = tf.random.shuffle(idx)
x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])

train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
# The original built this with zip(), which yields a one-shot iterator of
# paired rows and cannot serve as (images, labels) data. Keep the merged
# train+val data as a tuple of tensors instead.
train_val_data = (tf.convert_to_tensor(x), tf.convert_to_tensor(y))
def normalize(images, labels):
    """Scale images to [0, 1], flatten to 784-vectors, one-hot the labels."""
    flat = tf.reshape(tf.cast(images, tf.float32) / 255., [28 * 28])
    return flat, tf.one_hot(labels, depth=10)
BATCH_SIZE = 100
# Normalize, shuffle the whole split, batch, and overlap preprocessing with
# training via prefetch.
train_data = train_data.map(normalize).shuffle(50000).batch(BATCH_SIZE)\
.prefetch(tf.data.experimental.AUTOTUNE)
val_data = val_data.map(normalize).shuffle(10000).batch(BATCH_SIZE)\
.prefetch(tf.data.experimental.AUTOTUNE)
# Test data is only evaluated, so no shuffling is needed.
test_data = test_data.map(normalize).batch(BATCH_SIZE)
class MyDense(tf.keras.layers.Layer):
    """Minimal fully-connected layer: out = inputs @ w + b (no activation)."""

    def __init__(self, inp_dim, outp_dim):
        super(MyDense, self).__init__()
        # add_variable() is deprecated (and removed in recent TF2 releases);
        # add_weight() is the supported API with the same (name, shape) args.
        self.kernel = self.add_weight('w', [inp_dim, outp_dim])
        self.bias = self.add_weight('b', [outp_dim])

    def call(self, inputs, training=None):
        # Plain affine transform; the caller applies the activation.
        out = inputs @ self.kernel + self.bias
        return out
class MyModel(tf.keras.Model):
    """Five-layer MLP mapping flattened 28*28 images to 10-class logits."""

    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = MyDense(28 * 28, 256)
        self.fc2 = MyDense(256, 128)
        self.fc3 = MyDense(128, 64)
        self.fc4 = MyDense(64, 32)
        self.fc5 = MyDense(32, 10)

    def call(self, inputs, training=None, mask=None):
        h = inputs
        # ReLU after each hidden layer; the last layer stays linear so the
        # output can be fed to a from_logits loss.
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            h = tf.nn.relu(hidden(h))
        return self.fc5(h)
model = MyModel()
# input_shape must be a flat shape, not a nested list:
# [None, [28*28]] is invalid -- use [None, 28*28].
model.build(input_shape=[None, 28 * 28])
model.summary()
# `lr` is a deprecated alias; `learning_rate` is the supported keyword.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
# validation_split only works with tensor/NumPy inputs (the original call
# also referenced undefined names); validate with the explicit val pipeline.
model.fit(train_data, epochs=5, validation_data=val_data, validation_freq=1)
model.evaluate(test_data)
Regularization. L1-regularized cross-entropy: $J(\theta)=-\frac{1}{m}\sum_{i=1}^m\left[y_i\ln\hat{y}_i+(1-y_i)\ln(1-\hat{y}_i)\right]+\lambda\sum_{i=1}^n|\theta_i|$.
# Binary classifier with L2 weight decay (lambda = 0.001) on the hidden
# layers' kernels.
l2_model = tf.keras.models.Sequential([
    # input_shape is the per-example shape and must NOT include the batch
    # dimension: ([None, 28*28]) was invalid -- use (28*28,).
    tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.001),
                          activation=tf.nn.relu,
                          input_shape=(28 * 28,)),
    tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.001),
                          activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
# Manual training loop with an explicit L2 penalty added to the loss.
for step, (x, y) in enumerate(train_data):
    with tf.GradientTape() as tape:
        # The original fragment used `out` and `y_onehot` without defining
        # them, and wrote `from_logits=true` (a NameError); both fixed here.
        # NOTE(review): assumes train_data yields integer labels -- if the
        # pipeline already one-hots them, drop the tf.one_hot call.
        out = model(x)
        y_onehot = tf.one_hot(y, depth=10)
        loss = tf.reduce_mean(
            tf.losses.categorical_crossentropy(y_onehot, out, from_logits=True))
        # Sum of 0.5*||p||^2 over every trainable weight, scaled by lambda.
        loss_regularization = []
        for p in model.trainable_variables:
            loss_regularization.append(tf.nn.l2_loss(p))
        loss_regularization = tf.reduce_sum(tf.stack(loss_regularization))
        loss = loss + 0.0001 * loss_regularization
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
Momentum. Plain gradient descent: $w^{k+1}=w^k-\alpha\nabla f(w^k)$. With momentum: $z^{k+1}=\beta z^k+\nabla f(w^k)$, then $w^{k+1}=w^k-\alpha z^{k+1}$.
# Three alternative optimizers with momentum-style state; each assignment
# overwrites the previous one, so only the Adam instance is actually kept.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.02, momentum=0.9)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.02, momentum=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.02, beta_1=0.9, beta_2=0.999)
learning rate decay
optimizer = tf.keras.optimizers.SGD(learning_rate=0.2)
for epoch in range(100):
    # get loss (forward pass / tape omitted in this sketch)
    # change learning rate: linear decay from 0.2 down to 0 over 100 epochs
    optimizer.learning_rate = 0.2 *(100 - epoch) / 100
    # update weights (apply_gradients omitted in this sketch)
Early Stopping
Dropout
# Five-layer MLP with 50% dropout after the two widest hidden layers.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(32, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10))
for step, (x,y) in enumerate(train_data):
    with tf.GradientTape() as tape:
        x = tf.reshape(x, (-1, 28*28))
        # train: training=True enables dropout
        out = model(x, training=True)
# val: out = model(x, training=False)
# test: training=False disables dropout (inference uses the full network)
out = model(x, training=False)
Stochastic Gradient Descent
import os
import tensorflow as tf
# Silence TF C++ info/warning logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Raw Fashion-MNIST tensors: 60k train, 10k test.
(x, y), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
def normalize(images, labels):
    """Cast images to float32, scale to [0, 1], flatten; labels pass through."""
    scaled = tf.cast(images, tf.float32) / 255.
    return tf.reshape(scaled, [28 * 28]), labels
# Baseline 5-layer MLP: 784 -> 256 -> 128 -> 64 -> 32 -> 10 logits.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(10)
])
# Build with an explicit batch-agnostic input shape so summary() can print.
model.build(input_shape=[None, 28*28])
model.summary()
# `lr` is a deprecated alias in tf.keras optimizers; use `learning_rate`.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
# Streaming metrics: training loss, validation accuracy, test accuracy.
acc_meter = tf.keras.metrics.Accuracy()
loss_meter = tf.keras.metrics.Mean()
test_meter = tf.keras.metrics.Accuracy()
BATCH_SIZE = 128
# The test split never changes, so build its pipeline once up front.
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_data = test_data.map(normalize).batch(BATCH_SIZE)
for epoch in range(5):
    # Re-draw a random 50k/10k train/validation split every epoch.
    idx = tf.range(60000)
    idx = tf.random.shuffle(idx)
    x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
    x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])
    #
    train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_data = train_data.map(normalize).shuffle(50000).batch(BATCH_SIZE)
    val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    val_data = val_data.map(normalize).shuffle(10000).batch(BATCH_SIZE)
    # Batch variables are named *_batch so they no longer shadow the full
    # x_train / x_val / x_test tensors as the original loop did.
    for step, (x_batch, y_batch) in enumerate(train_data):
        with tf.GradientTape() as tape:
            y_one = tf.one_hot(y_batch, depth=10)
            logits = model(x_batch)
            loss_ce = tf.reduce_mean(
                tf.losses.categorical_crossentropy(y_one, logits, from_logits=True))
        loss_meter.update_state(loss_ce)
        grads = tape.gradient(loss_ce, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Every 10 steps, report running loss and validation accuracy.
        if step % 10 == 0:
            for x_vb, y_vb in val_data:
                logits = model(x_vb)
                pred = tf.argmax(tf.nn.softmax(logits, axis=1), axis=1)
                acc_meter.update_state(y_vb, pred)
            print(epoch, step, 'loss:', loss_meter.result().numpy(),
                  'Evaluate Acc:', acc_meter.result().numpy())
            loss_meter.reset_states()
            acc_meter.reset_states()
    # End-of-epoch accuracy on the untouched test split.
    for x_tb, y_tb in test_data:
        logits = model(x_tb)
        pred = tf.argmax(tf.nn.softmax(logits, axis=1), axis=1)
        test_meter.update_state(y_tb, pred)
    print(epoch, 'Test Acc:', test_meter.result().numpy())
    test_meter.reset_states()