本文实现了一个两层的模拟异或(XOR)逻辑运算的神经网络。实现没有借助Pytorch等机器学习库,从而有助于读者更好地理解反向传播的计算细节。
我们知道感知学习(Perceptron learning)能表达的模型是线性的,如下图所示,其可以表达逻辑运算 AND, OR, NOR, 但是却不能表达 XOR。 image.png
但我们可以使用2层的网络,且引入能够让空间发生“扭曲”的激活函数(Activation Function),就能够让模型表达 XOR。 具体原理是 $x_1\enspace XOR\enspace x_2$ 可以被改写为: $(x_1\enspace AND\enspace x_2)\enspace NOR\enspace (x_1\enspace NOR\enspace x_2)$。如下图网络的结构所示,其中$x_1$, $x_2$是输入,$z$是输出,$g()$ 是激活函数。$u_1$ 和 $u_2$ 代表 $(x_1 \enspace AND \enspace x_2)$ 和 $(x_1 \enspace NOR \enspace x_2)$ 的运算结果: image.png 我们可以手工推导可能的weights来验证模型的可行性:

# Choose the step function as the activation g:
def g(num):
    """Step activation: return 1 when num >= 0, otherwise 0."""
    return 1 if num >= 0 else 0

u1 = x1 AND x2 = x1 + x2 - 1.5 -> w11=1, w12=1, b1=-1.5
u2 = x1 NOR x2 = -x1 - x2 + 0.5 -> w21=-1, w22=-1, b2=0.5
s = y1 NOR y2 = -y1 - y2 + 0.5 -> v1=-1, v2=-1, c=0.5
s = -g(x1w11 + x2w12 + b1) - g(x1w21 + x2w22 + b2) + 0.5
  = -g(x1 + x2 - 1.5) - g(-x1 - x2 + 0.5) + 0.5

# 验证:
x1=1, x2=1 -> s=-g(2 - 1.5) - g(-2 + 0.5) + 0.5 = -1 - 0 + 0.5 = -0.5 -> z=0
x1=1, x2=0 -> s=-g(1 - 1.5) - g(-1 + 0.5) + 0.5 = -0 - 0 + 0.5 = 0.5 -> z=1
x1=0, x2=1 -> s=-g(1 - 1.5) - g(-1 + 0.5) + 0.5 = -0 - 0 + 0.5 = 0.5 -> z=1
x1=0, x2=0 -> s=-g(0 - 1.5) - g(0 + 0.5) + 0.5 = -0 - 1 + 0.5 = -0.5 -> z=0


延伸上面的逻辑改写的思路。我们知道任何的function都可以转化为一组DNF(Disjunctive Normal Form),这也就意味着任何的function都可以转化为2层的感知网络加activation。这也就不严谨地证明了万能近似定理。也就是说理论上2层的神经网络就可以表达任意的函数。


上述神经网络的求导过程如下:

∂E/∂z = z-t
∂z/∂s = z*(1-z)
∂E/∂s = ∂E/∂z * ∂z/∂s = (z-t) * z * (1-z) = δout
∂E/∂c = ∂E/∂s * ∂s/∂c = ∂E/∂s
∂E/∂v1 = δout * y1
∂E/∂v2 = δout * y2
∂E/∂y1 = δout * v1
∂E/∂y2 = δout * v2
∂y1/∂u1 = y1 *(1-y1)
∂E/∂u1 = ∂E/∂y1 * ∂y1/∂u1 = δout * v1 *y1 *(1-y1) = δ1
∂y2/∂u2 = y2 *(1-y2)
∂E/∂u2 = ∂E/∂y2 * ∂y2/∂u2 = δout * v2 *y2 *(1-y2) = δ2
(注:以上按隐层激活为 sigmoid 推导;下文代码实现的隐层实际使用 tanh,对应的导数为 ∂y/∂u = 1 - y²。)
∂u1/∂b1 = 1
∂u1/∂w11 = x1
∂u2/∂w21= x1
∂u1/∂w12 = x2
∂u2/∂w22 = x2
∂u2/∂b2 = 1
∂E/∂b1 = ∂E/∂u1 * ∂u1/∂b1 
∂E/∂b2 = ∂E/∂u2 * ∂u2/∂b2
∂E/∂w11 = ∂E/∂u1 * ∂u1/∂w11 =  δout * v1 *y1 *(1-y1) * x1 = (z-t) * z * (1-z) * v1 *y1 *(1-y1) * x1
- z = g( c + v1*g(b1 + w11x1 + w12x2) + v2*g(b2 + w21x1 + w22x2) )
- y1 = g(b1 + w11x1 + w12x2)
∂E/∂w12 = ∂E/∂u1 * ∂u1/∂w12 = δout * v1 *y1 *(1-y1) * x2
∂E/∂w21 = ∂E/∂u2 * ∂u2/∂w21 = δout * v2 *y2 *(1-y2) * x1
∂E/∂w22 = ∂E/∂u2 * ∂u2/∂w22 = δout * v2 *y2 *(1-y2) * x2


我们可以将上述求导过程实现为如下的代码。 该代码拷贝到本地后可以在CPU上进行训练。随机初始化的weights对是否能训练成功有一定影响,读者可以运行多次以进行尝试。

# learning XOR through a two-layer network using gradient descent
# w 的初始化值对是否能训练成功和训练的速度都有较大影响。这是无限模型带来的问题
import math
import random
import numpy as np
from datetime import datetime

# random.seed(3)  # uncomment for a reproducible run
# Seed from the current time so each run starts from different random weights.
# Passing a datetime object directly (as the original did) is deprecated and
# raises TypeError on Python 3.11+, so convert it to a float timestamp first.
random.seed(datetime.now().timestamp())

# Values already handed out, so every weight in the network is unique.
weightSet = set()


def random_weight():
    """Draw a unique, non-zero random weight from {-1.0, -0.9, ..., 0.9, 1.0}."""
    while True:
        candidate = random.randint(-10, 10) / 10
        if candidate != 0 and candidate not in weightSet:
            weightSet.add(candidate)
            return candidate


class Weights:
    """Container for the network's 9 parameters, addressable by name or index.

    Layout (see `indexMap`): hidden unit 1 (b1, w11, w12), hidden unit 2
    (b2, w21, w22), output unit (c, v1, v2).
    """

    # Weight name -> position in the flat weight list.
    indexMap = {"b1": 0, "w11": 1, "w12": 2, "b2": 3, "w21": 4, "w22": 5, "c": 6, "v1": 7, "v2": 8}

    def __init__(self):
        # Per-instance list (the original also declared a shared mutable
        # class attribute `weights = []`, a bug-prone anti-pattern; removed).
        # Starts from a known hand-derived XOR solution; the script normally
        # overwrites these with random values before training.
        self.weights = [-1.5, 1, 1, 0.5, -1, -1, 0.5, -1, -1]

    def getIndex(self, name):
        """Return the flat-list index of the weight called *name*."""
        return self.indexMap[name]

    def get(self, index):
        """Return the weight stored at *index*."""
        return self.weights[index]

    def getWeights(self):
        """Return the underlying weight list (mutable, not a copy)."""
        return self.weights

    def update(self, index, w):
        """Overwrite the weight at *index* with *w*."""
        self.weights[index] = w


η = 0.05  # learning rate (gradient-descent step size)

# Global weight container; every entry is replaced below with a unique
# random value before training starts.
weights = Weights()
for i in range(len(weights.getWeights())):
    weights.update(i, random_weight())

# "b1", "w11", "w12", "b2", "w21", "w22", "c", "v1", "v2"
# weights.weights = [-1.5, 1, 1, 0.5, -1, -1, 0.5, -1, -1]  # target
# target:
# weights.weights = [-3.6256294452213225, 2.472015878775767, 2.472015878775767, 1.4097549175948436, -3.0465380385550866,
#                    -3.0465380385550866, -3.7807463009027975, -4.221045218825591, -4.194984747863033]

# weights.weights = [0.5,-1,1,0.5,-1,-1,1.5,-1,-1]
# weights.weights = [-1.0, 0, 1, 0.5, -1, -1, 0.5, -1, -1]
# weights.weights = [0.0, 0.0, 0.0,  0.0, 0.0,  0.0, 0.0, 0.0, 0.0] # must not be initialized to all zeros

# XOR training set: each row of `inputs` is (x1, x2); `ts` holds the
# index-aligned target outputs.
inputs = [[1, 1], [1, 0], [0, 1], [0, 0]]
ts = [0, 1, 1, 0]


def cost():
    """Unused placeholder — never implemented, and not called in the visible code."""
    return


λ = 0.001  # weight decay λ


def pW(weights, λ):
    """Return the L2 weight-decay penalty (λ / 2) * Σ w² over all weights."""
    squared_sum = sum(w * w for w in weights.weights)
    return squared_sum * λ / 2


def sigmoid(num):
    """Numerically stable logistic function 1 / (1 + e^-num).

    The naive form overflows in math.exp for num below about -745; by
    branching on the sign, the exponent passed to exp is never positive.
    """
    # return step(num)  # alternative hard-threshold activation (unused)
    if num >= 0:
        return 1 / (1 + math.exp(-num))
    e = math.exp(num)
    return e / (1 + e)


def tanh(num) -> float:
    """Hyperbolic tangent of *num*.

    Delegates to math.tanh: mathematically identical to the original
    2 / (1 + e^(-2*num)) - 1 form, but it cannot overflow for large
    negative inputs (math.exp(-2*num) raised OverflowError there).
    Also fixes the meaningless `-> object` return annotation.
    """
    return math.tanh(num)


def step(num):
    """Hard threshold: 1 for strictly positive input, otherwise 0."""
    return 1 if num > 0 else 0


def activation(num):
    """Activation applied to the output unit; delegates to the sigmoid.

    Kept as a single switch point so the output activation can be swapped
    (e.g. for a hard step) in one place.
    """
    return sigmoid(num)


def calculate_s(inputs, weights):
    result = weights[0]
    for i in range(len(inputs)):
        x = inputs[i]
        result = result + x * weights[i + 1]
    return result


def c_z(x1, x2, weights):
    """Forward pass of the 2-layer net: tanh hidden units, `activation` output."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    hidden1 = tanh(x1 * w11 + x2 * w12 + b1)
    hidden2 = tanh(x1 * w21 + x2 * w22 + b2)
    return activation(hidden1 * v1 + hidden2 * v2 + c)


def c_z_step(x1, x2, weights):
    """Forward pass with a hard step (>0 -> 1, else 0) at every unit.

    Returns 0 or 1; useful for checking a weight set against the XOR
    truth table without any smooth activations.
    """
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    def hard(value):
        # Strictly-positive threshold, matching the original inline branches.
        return 1 if value > 0 else 0

    h1 = hard(x1 * w11 + x2 * w12 + b1)
    h2 = hard(x1 * w21 + x2 * w22 + b2)
    return hard(h1 * v1 + h2 * v2 + c)


def calculate_dE_dz(t, z, weights):
    """Derivative of E = (z - t)² / 2 with respect to the output z.

    `weights` is unused; it was kept for a commented-out weight-decay term.
    """
    return z - t


def calculatePartialDerivatives_c(t, x1, x2, weights):
    """∂E/∂c for one sample; equals δout = (z-t)·z·(1-z) since ∂s/∂c = 1."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass: tanh hidden units, module-level `activation` on the output.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    # δout = ∂E/∂z · ∂z/∂s (sigmoid derivative z(1-z)); ∂s/∂c = 1.
    return calculate_dE_dz(t, z, weights) * (z * (1 - z))


def calculatePartialDerivatives_v1(t, x1, x2, weights):
    """∂E/∂v1 for one sample: δout · y1."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    # δout = ∂E/∂z · ∂z/∂s; ∂s/∂v1 = y1.
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    return delta_out * y1


def calculatePartialDerivatives_v2(t, x1, x2, weights):
    """∂E/∂v2 for one sample: δout · y2."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    # δout = ∂E/∂z · ∂z/∂s; ∂s/∂v2 = y2.
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    return delta_out * y2


def calculatePartialDerivatives_b1(t, x1, x2, weights):
    """∂E/∂b1 for one sample: δout · v1 · (1 - y1²), since ∂u1/∂b1 = 1."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    # Back through v1 into hidden unit 1; tanh'(u1) = 1 - y1².
    return (delta_out * v1) * (1 - y1 ** 2)


def calculatePartialDerivatives_w11(t, x1, x2, weights):
    """∂E/∂w11 for one sample: δout · v1 · (1 - y1²) · x1."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    # δ1 = δout · v1 · tanh'(u1); then ∂u1/∂w11 = x1.
    delta_hidden1 = (delta_out * v1) * (1 - y1 ** 2)
    return delta_hidden1 * x1


def calculatePartialDerivatives_w12(t, x1, x2, weights):
    """∂E/∂w12 for one sample: δout · v1 · (1 - y1²) · x2."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    # δ1 = δout · v1 · tanh'(u1); then ∂u1/∂w12 = x2.
    delta_hidden1 = (delta_out * v1) * (1 - y1 ** 2)
    return delta_hidden1 * x2


def calculatePartialDerivatives_b2(t, x1, x2, weights):
    """∂E/∂b2 for one sample: δout · v2 · (1 - y2²), since ∂u2/∂b2 = 1."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    # Back through v2 into hidden unit 2; tanh'(u2) = 1 - y2².
    return (delta_out * v2) * (1 - y2 ** 2)


def calculatePartialDerivatives_w21(t, x1, x2, weights):
    """∂E/∂w21 for one sample: δout · v2 · (1 - y2²) · x1."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    # δ2 = δout · v2 · tanh'(u2); then ∂u2/∂w21 = x1.
    delta_hidden2 = (delta_out * v2) * (1 - y2 ** 2)
    return delta_hidden2 * x1


def calculatePartialDerivatives_w22(t, x1, x2, weights):
    """∂E/∂w22 for one sample: δout · v2 · (1 - y2²) · x2."""
    b1, w11, w12, b2, w21, w22, c, v1, v2 = weights.getWeights()

    # Forward pass.
    y1 = tanh(x1 * w11 + x2 * w12 + b1)
    y2 = tanh(x1 * w21 + x2 * w22 + b2)
    z = activation(y1 * v1 + y2 * v2 + c)
    delta_out = calculate_dE_dz(t, z, weights) * (z * (1 - z))
    # δ2 = δout · v2 · tanh'(u2); then ∂u2/∂w22 = x2.
    delta_hidden2 = (delta_out * v2) * (1 - y2 ** 2)
    return delta_hidden2 * x2


def calculatePartialDerivatives_sum(inputs, weights, calculatePartialDerivatives_single):
    """Sum the given single-sample partial derivative over the full batch.

    Targets come from the module-level `ts`, index-aligned with `inputs`.
    """
    total = 0
    for i, (x1, x2) in enumerate(inputs):
        total = total + calculatePartialDerivatives_single(ts[i], x1, x2, weights)
    return total


# One single-sample partial-derivative function per weight, ordered to match
# Weights.indexMap ("b1", "w11", "w12", "b2", "w21", "w22", "c", "v1", "v2"),
# so a weight's index doubles as an index into this list.
calulatePartialDerivativesFunctions = [
    calculatePartialDerivatives_b1,
    calculatePartialDerivatives_w11,
    calculatePartialDerivatives_w12,
    calculatePartialDerivatives_b2,
    calculatePartialDerivatives_w21,
    calculatePartialDerivatives_w22,
    calculatePartialDerivatives_c,
    calculatePartialDerivatives_v1,
    calculatePartialDerivatives_v2]


def getCalulatePartialDerivativesFunction(index):
    """Return the derivative function for the weight at *index* (see Weights.indexMap)."""
    return calulatePartialDerivativesFunctions[index]


def calulateError(inputs, ts, weightIndex, w, weights):
    """Set weight *weightIndex* to *w*, then return half the summed squared
    error of the network over the (inputs, ts) pairs.

    NOTE: mutates *weights* as a side effect.  A commented-out weight-decay
    term (pW) and a KL-divergence loss existed in the original.
    """
    weights.update(weightIndex, w)
    E = 0
    for i, (x1, x2) in enumerate(inputs):
        prediction = c_z(x1, x2, weights)
        E = E + (prediction - ts[i]) ** 2
    return E / 2


epochs = 10000  # full-batch gradient-descent iterations
# epochs = 10


def adjustWeights():
    """Train all 9 weights with full-batch gradient descent for `epochs` steps."""
    weightNames = ["b1", "w11", "w12", "b2", "w21", "w22", "c", "v1", "v2"]
    weightIndexs = [weights.getIndex(name) for name in weightNames]
    derivatives = [0] * 9
    for _ in range(epochs):
        # Compute every partial derivative first, then apply all updates
        # together (updating each weight immediately would also work).
        # This does not guard against plateaus or saddle points, but the
        # small initial weights make hitting them unlikely.
        for weightIndex in weightIndexs:
            cF = getCalulatePartialDerivativesFunction(weightIndex)
            derivatives[weightIndex] = calculatePartialDerivatives_sum(inputs, weights, cF)

        for weightIndex in weightIndexs:
            current = weights.get(weightIndex)
            weights.update(weightIndex, current - η * derivatives[weightIndex])
        print("middle:", weights.__dict__)


def check():
    """Print each XOR sample's prediction and the final half-SSE."""
    E = 0
    for i, (x1, x2) in enumerate(inputs):
        t = ts[i]
        z = c_z(x1, x2, weights)
        E = E + (z - t) ** 2
        print("[{0},{1}] target: {2}, actual:{3}, E:{4}".format(x1, x2, t, z, E))
    print("E", E / 2)


if __name__ == '__main__':
    # Show the random starting weights, train, then report per-sample results.
    # weights.update(weights.getIndex("w21"), 10)
    print("start:", weights.__dict__)
    # check()
    adjustWeights()
    print("final:", weights.__dict__)
    check()


训练输出如下:

...
final: {'weights': [-1.4450043568054496, 2.982519563105286, -3.0561807995146717, 1.365020614336376, 2.915482554021478, -2.835712113443007, 3.644913260490926, 4.032086569727394, -4.054900067525553]}
[1,1] target: 0, actual:0.025435505534571242, E:0.0006469649417992043
[1,0] target: 1, actual:0.9633013476840432, E:0.0019937560236066867
[0,1] target: 1, actual:0.9631037292967697, E:0.0033550908154127384
[0,0] target: 0, actual:0.028721019871054448, E:0.004179987797846243
E 0.0020899938989231213