
Batch Normalization (批标准化)

#author:victor

# Why do we need Batch Normalization?
"""
将分散的数据的统一标准化的方法。
数据分布会对神经网络训练产生影响
因为没有进行标准化,导致数据不敏感。
是为了克服神经网络层数加深导致难以训练而诞生的一个算法。
根据ICS理论,当训练集的样本数据和目标样本集分布不一致的时候,
训练得到的模型无法很好的泛化
在神经网络中,每一层的输入在经过层内操作之后必然会导致与原来对应的输入信号分布不同
,并且前层神经网络的增加会被后面的神经网络不对的累积放大。
这个问题的一个解决思路就是根据训练样本
与目标样本的比例对训练样本进行一个矫正,
而BN算法(批标准化)则可以用来规范化某些层或者所有层的输入
从而固定每层输入信号的均值与方差

Batch也就是把Data分成小批小批的来进行梯度下降。
解决方法:
显示数据X,然后经过全连接层fully connection layer,然后Batch Normalization(BN)
添加在数据X和全连接层之间。
然后在经过激励函数,再经过全连接层,这么下去
BN可以加快你的机器学习,也可以很有效的训练。
"""
# import modules
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


# ACTIVATION = tf.nn.relu  # activation function: use relu in every layer
ACTIVATION = tf.nn.tanh    # activation function: use tanh in every layer
N_LAYERS = 7               # build 7 hidden layers
N_HIDDEN_UNITS = 30        # 30 neurons in each hidden layer
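
# A minimal NumPy sketch (illustration only, not used by the network below; the
# names _x_batch, _gamma, _beta are made up) of the two-step transform described
# in the docstring above: normalize a mini-batch to zero mean / unit variance per
# feature, then rescale with a learnable scale (gamma) and shift (beta).
_x_batch = np.random.randn(64, N_HIDDEN_UNITS)       # one mini-batch of 64 samples
_mean = _x_batch.mean(axis=0)                        # per-feature mean over the batch
_var = _x_batch.var(axis=0)                          # per-feature variance over the batch
_x_hat = (_x_batch - _mean) / np.sqrt(_var + 0.001)  # normalize (epsilon avoids division by zero)
_gamma, _beta = np.ones(N_HIDDEN_UNITS), np.zeros(N_HIDDEN_UNITS)  # learnable scale and shift
_bn_out = _gamma * _x_hat + _beta                    # the same two steps tf.nn.batch_normalization performs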

# fix the random seeds so runs are reproducible
def fix_seed(seed=1):
    # reproducible
    np.random.seed(seed)
    tf.set_random_seed(seed)

# plot the distribution of every layer's inputs
def plot_his(inputs, inputs_norm):
    # plot histogram for the inputs of every layer
    for j, all_inputs in enumerate([inputs, inputs_norm]):
        for i, input in enumerate(all_inputs):
            plt.subplot(2, len(all_inputs), j*len(all_inputs)+(i+1))
            plt.cla()
            if i == 0:
                the_range = (-7, 10)
            else:
                the_range = (-1, 1)
            plt.hist(input.ravel(), bins=15, range=the_range, color='#FF5733')
            plt.yticks(())
            if j == 1:
                plt.xticks(the_range)
            else:
                plt.xticks(())
            ax = plt.gca()
            ax.spines['right'].set_color('none')
            ax.spines['top'].set_color('none')
        plt.title("%s normalizing" % ("Without" if j == 0 else "With"))
    plt.draw()
    plt.pause(0.01)

# build the network
def built_net(xs, ys, norm):
    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        # weights and biases (bad initialization for this case)
        Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0., stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)

        # fully connected product
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        # normalize fully connected product
        if norm:
            # Batch Normalize
            # fc_mean: mean of the whole batch
            # fc_var: variance of the whole batch
            fc_mean, fc_var = tf.nn.moments(
                Wx_plus_b,
                axes=[0],  # the dimension you want to normalize over, here [0] for the batch
                           # for images, use [0, 1, 2] for [batch, height, width] but not the channel
            )
            scale = tf.Variable(tf.ones([out_size]))
            shift = tf.Variable(tf.zeros([out_size]))
            epsilon = 0.001

            # apply a moving average to mean and var while training on batches
            # (see the plain-Python sketch of the EMA update rule after this function)
            ema = tf.train.ExponentialMovingAverage(decay=0.5)
            def mean_var_with_update():
                ema_apply_op = ema.apply([fc_mean, fc_var])
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(fc_mean), tf.identity(fc_var)
            mean, var = mean_var_with_update()

            Wx_plus_b = tf.nn.batch_normalization(Wx_plus_b, mean, var, shift, scale, epsilon)
            # tf.nn.batch_normalization is essentially the same computation as these two steps:
            # Wx_plus_b = (Wx_plus_b - fc_mean) / tf.sqrt(fc_var + 0.001)
            # Wx_plus_b = Wx_plus_b * scale + shift
            # scale is the scaling parameter, shift is the translation parameter

        # activation: pass the (possibly normalized) Weights/biases result through the activation function
        if activation_function is None:
            outputs = Wx_plus_b
        else:
            outputs = activation_function(Wx_plus_b)

        return outputs

    fix_seed(1)

    # if normalization is enabled, also add a BN step for the raw input
    if norm:
        # BN for the first input
        # fc_mean: mean of the whole batch
        # fc_var: variance of the whole batch
        fc_mean, fc_var = tf.nn.moments(
            xs,
            axes=[0],
        )
        scale = tf.Variable(tf.ones([1]))
        shift = tf.Variable(tf.zeros([1]))
        epsilon = 0.001
        # apply a moving average to mean and var while training on batches
        ema = tf.train.ExponentialMovingAverage(decay=0.5)
        def mean_var_with_update():
            ema_apply_op = ema.apply([fc_mean, fc_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(fc_mean), tf.identity(fc_var)
        mean, var = mean_var_with_update()
        xs = tf.nn.batch_normalization(xs, mean, var, shift, scale, epsilon)

    # record inputs for every layer
    layers_inputs = [xs]

    # build hidden layers
    for l_n in range(N_LAYERS):
        layer_input = layers_inputs[l_n]
        in_size = layers_inputs[l_n].get_shape()[1].value

        output = add_layer(
            layer_input,     # input
            in_size,         # input size
            N_HIDDEN_UNITS,  # output size
            ACTIVATION,      # activation function
            norm,            # normalize before activation
        )
        layers_inputs.append(output)  # add output for next run

    # build output layer
    prediction = add_layer(layers_inputs[-1], N_HIDDEN_UNITS, 1, activation_function=None)

    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]  # the network hands back train_op, cost and layers_inputs
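
# For reference: tf.train.ExponentialMovingAverage (used inside add_layer above) keeps a
# shadow value for each tracked tensor and, every time ema.apply(...) runs, updates it as
#     shadow = decay * shadow + (1 - decay) * value
# With decay=0.5 the current batch statistics are weighted heavily. A plain-Python sketch
# of that single update step, with illustrative names (not part of the tutorial network):
def _ema_update(shadow, value, decay=0.5):
    # one exponential-moving-average step, as applied to fc_mean / fc_var above
    return decay * shadow + (1.0 - decay) * value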

# make up data
fix_seed(1)
x_data = np.linspace(-7, 10, 2500)[:, np.newaxis]
np.random.shuffle(x_data)
noise = np.random.normal(0, 8, x_data.shape)
y_data = np.square(x_data) - 5 + noise

# plot input data
plt.scatter(x_data, y_data)
plt.show()

xs = tf.placeholder(tf.float32, [None, 1]) # [num_samples, num_features]
ys = tf.placeholder(tf.float32, [None, 1])

train_op, cost, layers_inputs = built_net(xs, ys, norm=False) # without BN
train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True) # with BN

sess = tf.Session()
if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
    init = tf.initialize_all_variables()
else:
    init = tf.global_variables_initializer()
sess.run(init)

# record cost
cost_his = []
cost_his_norm = []
record_step = 5

plt.ion()
plt.figure(figsize=(7, 3))
for i in range(250):
    if i % 50 == 0:
        # plot histogram
        all_inputs, all_inputs_norm = sess.run([layers_inputs, layers_inputs_norm], feed_dict={xs: x_data, ys: y_data})
        plot_his(all_inputs, all_inputs_norm)

    # train on batch
    sess.run([train_op, train_op_norm], feed_dict={xs: x_data[i*10:i*10+10], ys: y_data[i*10:i*10+10]})

    if i % record_step == 0:
        # record cost
        cost_his.append(sess.run(cost, feed_dict={xs: x_data, ys: y_data}))
        cost_his_norm.append(sess.run(cost_norm, feed_dict={xs: x_data, ys: y_data}))

# matplotlib's default display mode is "blocking": plt.show() pauses the program and
# nothing after it runs. To show a live, updating figure, call plt.ion() first to switch
# from blocking to interactive mode.
# Do not forget plt.ioff() before the final plt.show(); otherwise the window just flashes
# and closes instead of staying on screen.
plt.ioff()
plt.figure()
# plot the cost history without batch normalization
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his), label='no BN')

# plot the cost history with batch normalization
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his_norm), label='BN')

plt.legend()
plt.show()

Results:

(figure: batch normalization results)

Summary: after batch normalization the data in each layer stays concentrated instead of being pushed toward one extreme, so training works better and the result generalizes better.