TensorFlow 2.0 Notes (4): Stochastic Gradient Descent


Derivatives, Partial Derivatives, and Gradients
# tf.GradientTape
import tensorflow as tf

w = tf.constant(1.)
x = tf.constant(2.)
y = x * w

with tf.GradientTape() as tape:
    tape.watch([w])
    y2 = x * w

# y was computed outside the tape, so nothing was recorded for it:
grad1 = tape.gradient(y, [w])   # [None]

with tf.GradientTape() as tape:
    tape.watch([w])
    y2 = x * w

# y2 was computed inside the tape while w was watched:
grad2 = tape.gradient(y2, [w])  # [2.0]

# A non-persistent tape can only be used once; pass persistent=True
# to call gradient() several times on the same tape.
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w])
    y2 = x * w

grad3 = tape.gradient(y2, [w])  # [2.0]
grad4 = tape.gradient(y2, [w])  # [2.0]

Second-Order Derivatives
import tensorflow as tf

w = tf.Variable(1.)
b = tf.Variable(2.)
x = tf.Variable(3.)

# tf.Variable objects are watched automatically, so no tape.watch() is needed.
with tf.GradientTape() as t1:
    with tf.GradientTape() as t2:
        y = x * w + b
    # First-order gradients, taken inside t1 so that t1 records them
    dy_dw, dy_db = t2.gradient(y, [w, b])

# Second-order gradient: y is linear in w, so d2y/dw2 is zero,
# which GradientTape reports as None.
d2y_dw2 = t1.gradient(dy_dw, w)

print(dy_dw)    # tf.Tensor(3.0, ...)
print(dy_db)    # tf.Tensor(1.0, ...)
print(d2y_dw2)  # None

assert dy_dw.numpy() == 3.0
assert d2y_dw2 is None
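
For a target that is nonlinear in w the second derivative is non-zero. A minimal sketch (not from the original notes):

w = tf.Variable(1.)
x = tf.constant(3.)

with tf.GradientTape() as t1:
    with tf.GradientTape() as t2:
        y = x * w ** 2
    dy_dw = t2.gradient(y, w)    # 2*x*w = 6.0

d2y_dw2 = t1.gradient(dy_dw, w)  # 2*x = 6.0
print(dy_dw.numpy(), d2y_dw2.numpy())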

Activation Functions
Sigmoid/Logistic
$f(x) = \sigma(x) = \dfrac{1}{1+e^{-x}}, \qquad \sigma' = \sigma(1-\sigma)$
a = tf.linspace(-10., 10., 10)
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.nn.sigmoid(a)

grads = tape.gradient(y, [a])
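
As a quick sanity check (an addition, not part of the original notes), the autodiff result can be compared with the analytic derivative $\sigma(1-\sigma)$:

analytic = y * (1 - y)  # sigma * (1 - sigma), reusing y from the tape above
print(tf.reduce_max(tf.abs(grads[0] - analytic)).numpy())  # ~0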

Tanh
$f(x) = \tanh(x) = \dfrac{e^{x}-e^{-x}}{e^{x}+e^{-x}} = 2\,\mathrm{sigmoid}(2x) - 1, \qquad \dfrac{d}{dx}\tanh(x) = 1 - \tanh^{2}(x)$
a = tf.linspace(-5., 5., 10)
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.nn.tanh(a)

grads = tape.gradient(y, [a])
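
The same kind of check (again an addition) works for tanh, whose derivative is $1-\tanh^2(x)$:

analytic = 1 - y ** 2
print(tf.reduce_max(tf.abs(grads[0] - analytic)).numpy())  # ~0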

Rectified Linear Unit
$f(x) = \begin{cases} 0 & \text{for } x < 0 \\ x & \text{for } x \geqslant 0 \end{cases}, \qquad f'(x) = \begin{cases} 0 & \text{for } x < 0 \\ 1 & \text{for } x \geqslant 0 \end{cases}$
a = tf.linspace(-1., 1., 10)
tf.nn.relu(a)
tf.nn.leaky_relu(a)
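
ReLU's piecewise-constant derivative can also be read off a tape directly; a small sketch mirroring the sigmoid/tanh examples above:

a = tf.linspace(-1., 1., 10)
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.nn.relu(a)

grads = tape.gradient(y, [a])
print(grads[0].numpy())  # 0 for negative inputs, 1 for positive inputs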

Typical Loss
  • Mean Squared Error
  • $loss = \sum\big[y - (xw + b)\big]^{2}$
  • $L_{2\text{-}norm} = \big\lVert y - (xw + b) \big\rVert_{2}$
  • $loss = \mathrm{norm}\big(y - (xw + b)\big)^{2}$

  • MSE Derivative
  • $loss = \sum\big[y - f_{\theta}(x)\big]^{2}$
  • $\dfrac{\nabla\, loss}{\nabla\theta} = 2\sum\big[y - f_{\theta}(x)\big] \cdot \dfrac{\nabla f_{\theta}(x)}{\nabla\theta}$

  • x = tf.random.normal([2, 4])
    w = tf.random.normal([4, 3])
    b = tf.zeros([3])
    y = tf.constant([2, 0])
    
    with tf.GradientTape() as tape:
        tape.watch([w, b])  # not needed if w and b were tf.Variable
        prob = tf.nn.softmax(x@w+b, axis=1)
        loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
    
    grads = tape.gradient(loss, [w, b])
    grads[0]
    grads[1]
    
  • Cross Entropy Loss
  • binary
  • multi-class
  • softmax
  • soft version of max: $S(y_i) = \dfrac{e^{y_i}}{\sum_{j} e^{y_j}}$

  • Leave it to Logistic Regression Part

  • SoftMax Derivative
    $p_i = \dfrac{e^{a_i}}{\sum_{k=1}^{N} e^{a_k}}$
    $\dfrac{\partial p_i}{\partial a_j} = \begin{cases} p_i(1-p_j) & \text{if } i = j \\ -p_j\, p_i & \text{if } i \neq j \end{cases}$
    Or, using the Kronecker delta $\delta_{ij} = \begin{cases} 1 & \text{if } i = j \\ 0 & \text{if } i \neq j \end{cases}$:

  • $\dfrac{\partial p_i}{\partial a_j} = p_i(\delta_{ij} - p_j)$ (verified numerically in the sketch after the code below)
    x = tf.random.normal([2, 4])
    w = tf.random.normal([4, 3])
    b = tf.zeros([3])
    y = tf.constant([2, 0])
    
    with tf.GradientTape() as tape:
        tape.watch([w, b])
        logits = x@w+b
        loss = tf.reduce_mean(tf.losses.categorical_crossentropy(tf.one_hot(y, depth=3), logits, from_logits=True))
    
    grads = tape.gradient(loss, [w, b])
    grads[0]
    grads[1]
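
The identity $\frac{\partial p_i}{\partial a_j} = p_i(\delta_{ij} - p_j)$ can be checked numerically with tape.jacobian. A minimal sketch, assuming an arbitrary 3-element input:

    a = tf.constant([1., 2., 3.])
    with tf.GradientTape() as tape:
        tape.watch(a)
        p = tf.nn.softmax(a)

    jac = tape.jacobian(p, a)                             # dp_i/da_j, shape [3, 3]
    analytic = tf.linalg.diag(p) - tf.tensordot(p, p, 0)  # p_i * (delta_ij - p_j)
    print(tf.reduce_max(tf.abs(jac - analytic)).numpy())  # ~0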
    

The Perceptron and Its Gradient


Single-Output Perceptron
  • $y = XW + b$
  • $y = \sum x_i \cdot w_i + b$

  • Derivative (compared against the tape gradient in the sketch after the code below)
    $E = \frac{1}{2}(O_0^1 - t)^2$
    $\frac{\partial E}{\partial w_{j0}} = (O_0 - t)\frac{\partial O_0}{\partial w_{j0}} = (O_0 - t)\frac{\partial \sigma(x_0)}{\partial w_{j0}} = (O_0 - t)\,\sigma(x_0)(1 - \sigma(x_0))\frac{\partial x_0^1}{\partial w_{j0}} = (O_0 - t)\,O_0(1 - O_0)\frac{\partial x_0^1}{\partial w_{j0}} = (O_0 - t)\,O_0(1 - O_0)\,x_j^0$
    x = tf.random.normal([1, 3])
    w = tf.ones([3, 1])
    b = tf.ones([1])
    y = tf.constant([1])
    
    with tf.GradientTape() as tape:
        tape.watch([w, b])
        prob = tf.nn.sigmoid(x@w+b)
        loss = tf.reduce_mean(tf.losses.MSE(y, prob))
    
    grads = tape.gradient(loss, [w, b])
    grads[0]
    grads[1]
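
A quick comparison of the tape gradient with the formula above (an added sketch, not from the original notes). The loss in this code is $(O - t)^2$ rather than $\frac{1}{2}(O - t)^2$, so the manual gradient carries an extra factor of 2, and the target here is $t = 1$:

    manual_dw = 2. * (prob - 1.) * prob * (1. - prob) * tf.transpose(x)  # 2*(O-t)*O*(1-O)*x_j
    print(tf.reduce_max(tf.abs(grads[0] - manual_dw)).numpy())  # ~0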
    

Multi-Output Perceptron
    Derivative:
    $E = \frac{1}{2}\sum_i (O_i^1 - t_i)^2$
    $\frac{\partial E}{\partial w_{jk}} = (O_k - t_k)\frac{\partial O_k}{\partial w_{jk}} = (O_k - t_k)\frac{\partial \sigma(x_k)}{\partial w_{jk}} = (O_k - t_k)\,\sigma(x_k)(1 - \sigma(x_k))\frac{\partial x_k^1}{\partial w_{jk}} = (O_k - t_k)\,O_k(1 - O_k)\frac{\partial x_k^1}{\partial w_{jk}} = (O_k - t_k)\,O_k(1 - O_k)\,x_j^0$
    x = tf.random.normal([2, 4])
    w = tf.random.normal([4, 3])
    b = tf.zeros([3])
    y = tf.constant([2, 0])
    
    with tf.GradientTape() as tape:
        tape.watch([w, b])
        prob = tf.nn.softmax(x@w+b, axis=1)
        loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
    
    grads = tape.gradient(loss, [w, b])
    grads[0]
    grads[1]
    

Chain Rule
    x = tf.Variable(1.)
    w1 = tf.Variable(2.)
    b1 = tf.Variable(1.)
    w2 = tf.Variable(2.)
    b2 = tf.Variable(1.)
    
    with tf.GradientTape(persistent=True) as tape:
        y1 = x * w1 + b1
        y2 = y1 * w2 + b2
    
    dy2_dy1 = tape.gradient(y2, [y1])[0]  # w2 = 2.0
    dy1_dw1 = tape.gradient(y1, [w1])[0]  # x  = 1.0
    dy2_dw1 = tape.gradient(y2, [w1])[0]  # dy2_dy1 * dy1_dw1 = 2.0
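
The chain rule can be asserted directly (an added check, not in the original):

    assert dy2_dw1.numpy() == (dy2_dy1 * dy1_dw1).numpy()  # 2.0 == 2.0 * 1.0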
    

Multi-Layer Perceptron Model
For an output-layer weight $w_{jk}$ (the result from the previous section):
$\frac{\partial E}{\partial w_{jk}} = (O_k - t_k)\,O_k(1 - O_k)\,O_j^J = \delta_k^K\, O_j^J$

For a hidden-layer weight $w_{ij}$:
$\frac{\partial E}{\partial w_{ij}} = \frac{\partial}{\partial w_{ij}} \frac{1}{2}\sum_{k \in K}(O_k - t_k)^2$
$\qquad = \sum_{k \in K}(O_k - t_k)\frac{\partial}{\partial W_{ij}} O_k$
$\qquad = \sum_{k \in K}(O_k - t_k)\frac{\partial}{\partial W_{ij}} \sigma(x_k)$
$\qquad = \sum_{k \in K}(O_k - t_k)\,O_k(1 - O_k)\frac{\partial x_k}{\partial O_j} \cdot \frac{\partial O_j}{\partial W_{ij}}$
$\qquad = \sum_{k \in K}(O_k - t_k)\,O_k(1 - O_k)\,W_{jk}\frac{\partial O_j}{\partial W_{ij}}$
$\qquad = \frac{\partial O_j}{\partial W_{ij}}\sum_{k \in K}(O_k - t_k)\,O_k(1 - O_k)\,W_{jk}$
$\qquad = O_j(1 - O_j)\frac{\partial x_j}{\partial W_{ij}}\sum_{k \in K}(O_k - t_k)\,O_k(1 - O_k)\,W_{jk}$
$\qquad = O_j(1 - O_j)\,O_i\sum_{k \in K}(O_k - t_k)\,O_k(1 - O_k)\,W_{jk}$
$\qquad = O_i\,O_j(1 - O_j)\sum_{k \in K}\delta_k W_{jk}$

For an output-layer node $k \in K$: $\frac{\partial E}{\partial W_{jk}} = O_j \delta_k$, where $\delta_k = O_k(1 - O_k)(O_k - t_k)$.
For a hidden-layer node $j \in J$: $\frac{\partial E}{\partial W_{ij}} = O_i \delta_j$, where $\delta_j = O_j(1 - O_j)\sum_{k \in K}\delta_k W_{jk}$.
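
The $\delta$ formulas can be checked against autodiff on a tiny all-sigmoid network. This is a sketch, not from the original notes; the 3-2-2 layer sizes and variable names are arbitrary assumptions:

    # Tiny 3-2-2 network with sigmoid activations and E = 1/2 * sum((O_k - t_k)^2)
    O_i = tf.random.normal([1, 3])                 # input activations O_i
    W_ij = tf.Variable(tf.random.normal([3, 2]))   # input -> hidden weights
    W_jk = tf.Variable(tf.random.normal([2, 2]))   # hidden -> output weights
    t = tf.constant([[0., 1.]])                    # targets t_k

    with tf.GradientTape() as tape:
        O_j = tf.nn.sigmoid(O_i @ W_ij)            # hidden layer output
        O_k = tf.nn.sigmoid(O_j @ W_jk)            # output layer output
        E = 0.5 * tf.reduce_sum((O_k - t) ** 2)

    dE_dWij, dE_dWjk = tape.gradient(E, [W_ij, W_jk])

    # Manual backprop with the delta rules above
    delta_k = (O_k - t) * O_k * (1 - O_k)
    delta_j = O_j * (1 - O_j) * (delta_k @ tf.transpose(W_jk))
    manual_dWjk = tf.transpose(O_j) @ delta_k      # dE/dW_jk = O_j * delta_k
    manual_dWij = tf.transpose(O_i) @ delta_j      # dE/dW_ij = O_i * delta_j

    print(tf.reduce_max(tf.abs(dE_dWjk - manual_dWjk)).numpy())  # ~0
    print(tf.reduce_max(tf.abs(dE_dWij - manual_dWij)).numpy())  # ~0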
Function Optimization
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    
    
    def himmelblau(x):
        return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2
    
    
    x = np.arange(-6, 6, 0.1)
    y = np.arange(-6, 6, 0.1)
    print('x,y range:', x.shape, y.shape)
    X, Y = np.meshgrid(x, y)
    print('X,Y maps:', X.shape, Y.shape)
    Z = himmelblau([X, Y])
    
    fig = plt.figure('himmelblau')
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z)
    ax.view_init(60, -30)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.show()
    
    x = tf.constant([-4., 0.])
    for step in range(200):
        with tf.GradientTape() as tape:
            tape.watch(x)
            y = himmelblau(x)
            
        grads = tape.gradient(y, [x])[0]
        x -= 0.01 * grads
        
        if step % 20 == 0:
            print('step {}: x = {}, f(x) = {}'.format(step, x.numpy(), y.numpy()))
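
The same minimization can be written with a tf.Variable and a built-in optimizer instead of the manual update rule; a sketch (not from the original notes):

    x = tf.Variable([-4., 0.])
    sgd = tf.keras.optimizers.SGD(learning_rate=0.01)
    for step in range(200):
        with tf.GradientTape() as tape:
            y = himmelblau(x)          # Variables are watched automatically
        grads = tape.gradient(y, [x])
        sgd.apply_gradients(zip(grads, [x]))

    print('x =', x.numpy(), 'f(x) =', himmelblau(x).numpy())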
    
    
Fashion-MNIST in Practice

    import os
    import tensorflow as tf
    import tensorflow_datasets as tfds
    
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    
    dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
    train_dataset, test_dataset = dataset['train'], dataset['test']
    
    
    def normalize(images, labels):
        images = tf.cast(images, tf.float32)
        images /= 255
        return images, labels
    
    
    print("datasets", train_dataset.map(normalize))
    train_dataset = train_dataset.map(normalize)
    test_dataset = test_dataset.map(normalize)
    
    num_train_examples = metadata.splits['train'].num_examples
    num_test_examples = metadata.splits['test'].num_examples
    
    BATCH_SIZE = 100
    # No repeat(): one full pass per epoch, so the inner loop terminates.
    train_dataset = train_dataset.shuffle(num_train_examples).batch(BATCH_SIZE)
    test_dataset = test_dataset.batch(BATCH_SIZE)
    
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(64, activation=tf.nn.relu),
        tf.keras.layers.Dense(32, activation=tf.nn.relu),
        tf.keras.layers.Dense(10)
    ])
    
    model.build(input_shape=[None, 28*28])
    model.summary()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    
    
    def main():
        for epoch in range(30):
            for step, (x,y) in enumerate(train_dataset):
                x = tf.reshape(x, [-1, 28*28])
    
                with tf.GradientTape() as tape:
                    logits = model(x)
                    y_ = tf.one_hot(y, depth=10)
                    #loss_mse = tf.reduce_mean(tf.losses.MSE(y_, logits))
                    loss_ce = tf.reduce_mean(tf.losses.categorical_crossentropy(y_, logits, from_logits=True))
    
                grads = tape.gradient(loss_ce, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
                if step % 500 == 0:
                    # test
                    total_correct = 0
                    total_num = 0
                    for x, y in test_dataset:
                        x = tf.reshape(x, [-1, 28 * 28])
                        logits = model(x)
                        prob = tf.nn.softmax(logits, axis=1)
                        pred = tf.argmax(prob, axis=1)
                        correct = tf.equal(pred, y)
                        correct = tf.reduce_sum(tf.cast(correct, dtype=tf.int32)).numpy()
    
                        total_correct += int(correct)
                        total_num += x.shape[0]
    
                    acc = total_correct / total_num
                    print(epoch, step, 'loss:', float(loss_ce), 'test acc:', acc)
    
    
    if __name__ == '__main__':
        main()