
Batch Normalization (批标准化)

#author:victor

# Why do we need Batch Normalization?
"""
将分散的数据的统一标准化的方法。
数据分布会对神经网络训练产生影响
因为没有进行标准化,导致数据不敏感。
是为了克服神经网络层数加深导致难以训练而诞生的一个算法。
根据ICS理论,当训练集的样本数据和目标样本集分布不一致的时候,
训练得到的模型无法很好的泛化
在神经网络中,每一层的输入在经过层内操作之后必然会导致与原来对应的输入信号分布不同
,并且前层神经网络的增加会被后面的神经网络不对的累积放大。
这个问题的一个解决思路就是根据训练样本
与目标样本的比例对训练样本进行一个矫正,
而BN算法(批标准化)则可以用来规范化某些层或者所有层的输入
从而固定每层输入信号的均值与方差

Batch也就是把Data分成小批小批的来进行梯度下降。
解决方法:
显示数据X,然后经过全连接层fully connection layer,然后Batch Normalization(BN)
添加在数据X和全连接层之间。
然后在经过激励函数,再经过全连接层,这么下去
BN可以加快你的机器学习,也可以很有效的训练。
"""
# import modules
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


# ACTIVATION = tf.nn.relu  # activation function: use relu in every layer
ACTIVATION = tf.nn.tanh    # activation function: use tanh in every layer
N_LAYERS = 7               # build 7 hidden layers
N_HIDDEN_UNITS = 30        # 30 neurons in each hidden layer
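
# A minimal NumPy sketch (illustration only, not used by the network below; the
# names _x_batch, _gamma, _beta are made up) of the two-step transform described
# in the docstring above: normalize a mini-batch to zero mean / unit variance per
# feature, then rescale with a learnable scale (gamma) and shift (beta).
_x_batch = np.random.randn(64, N_HIDDEN_UNITS)       # one mini-batch of 64 samples
_mean = _x_batch.mean(axis=0)                        # per-feature mean over the batch
_var = _x_batch.var(axis=0)                          # per-feature variance over the batch
_x_hat = (_x_batch - _mean) / np.sqrt(_var + 0.001)  # normalize (epsilon avoids division by zero)
_gamma, _beta = np.ones(N_HIDDEN_UNITS), np.zeros(N_HIDDEN_UNITS)  # learnable scale and shift
_bn_out = _gamma * _x_hat + _beta                    # the same two steps tf.nn.batch_normalization performs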

# fix the random seeds so runs are reproducible
def fix_seed(seed=1):
    # reproducible
    np.random.seed(seed)
    tf.set_random_seed(seed)

# plot the distribution of every layer's inputs
def plot_his(inputs, inputs_norm):
    # plot histogram for the inputs of every layer
    for j, all_inputs in enumerate([inputs, inputs_norm]):
        for i, input in enumerate(all_inputs):
            plt.subplot(2, len(all_inputs), j*len(all_inputs)+(i+1))
            plt.cla()
            if i == 0:
                the_range = (-7, 10)
            else:
                the_range = (-1, 1)
            plt.hist(input.ravel(), bins=15, range=the_range, color='#FF5733')
            plt.yticks(())
            if j == 1:
                plt.xticks(the_range)
            else:
                plt.xticks(())
            ax = plt.gca()
            ax.spines['right'].set_color('none')
            ax.spines['top'].set_color('none')
        plt.title("%s normalizing" % ("Without" if j == 0 else "With"))
    plt.draw()
    plt.pause(0.01)

# build the network
def built_net(xs, ys, norm):
    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        # weights and biases (bad initialization for this case)
        Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0., stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)

        # fully connected product
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        # normalize fully connected product
        if norm:
            # Batch Normalize
            # fc_mean: mean of the whole batch
            # fc_var: variance of the whole batch
            fc_mean, fc_var = tf.nn.moments(
                Wx_plus_b,
                axes=[0],  # the dimension you want to normalize over, here [0] for the batch
                           # for images, use [0, 1, 2] for [batch, height, width] but not the channel
            )
            scale = tf.Variable(tf.ones([out_size]))
            shift = tf.Variable(tf.zeros([out_size]))
            epsilon = 0.001

            # apply a moving average to mean and var while training on batches
            # (see the plain-Python sketch of the EMA update rule after this function)
            ema = tf.train.ExponentialMovingAverage(decay=0.5)
            def mean_var_with_update():
                ema_apply_op = ema.apply([fc_mean, fc_var])
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(fc_mean), tf.identity(fc_var)
            mean, var = mean_var_with_update()

            Wx_plus_b = tf.nn.batch_normalization(Wx_plus_b, mean, var, shift, scale, epsilon)
            # tf.nn.batch_normalization is essentially the same computation as these two steps:
            # Wx_plus_b = (Wx_plus_b - fc_mean) / tf.sqrt(fc_var + 0.001)
            # Wx_plus_b = Wx_plus_b * scale + shift
            # scale is the scaling parameter, shift is the translation parameter

        # activation: pass the (possibly normalized) Weights/biases result through the activation function
        if activation_function is None:
            outputs = Wx_plus_b
        else:
            outputs = activation_function(Wx_plus_b)

        return outputs

    fix_seed(1)

    # if normalization is enabled, also add a BN step for the raw input
    if norm:
        # BN for the first input
        # fc_mean: mean of the whole batch
        # fc_var: variance of the whole batch
        fc_mean, fc_var = tf.nn.moments(
            xs,
            axes=[0],
        )
        scale = tf.Variable(tf.ones([1]))
        shift = tf.Variable(tf.zeros([1]))
        epsilon = 0.001
        # apply a moving average to mean and var while training on batches
        ema = tf.train.ExponentialMovingAverage(decay=0.5)
        def mean_var_with_update():
            ema_apply_op = ema.apply([fc_mean, fc_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(fc_mean), tf.identity(fc_var)
        mean, var = mean_var_with_update()
        xs = tf.nn.batch_normalization(xs, mean, var, shift, scale, epsilon)

    # record inputs for every layer
    layers_inputs = [xs]

    # build hidden layers
    for l_n in range(N_LAYERS):
        layer_input = layers_inputs[l_n]
        in_size = layers_inputs[l_n].get_shape()[1].value

        output = add_layer(
            layer_input,     # input
            in_size,         # input size
            N_HIDDEN_UNITS,  # output size
            ACTIVATION,      # activation function
            norm,            # normalize before activation
        )
        layers_inputs.append(output)  # add output for next run

    # build output layer
    prediction = add_layer(layers_inputs[-1], N_HIDDEN_UNITS, 1, activation_function=None)

    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]  # the network hands back train_op, cost and layers_inputs
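
# For reference: tf.train.ExponentialMovingAverage (used inside add_layer above) keeps a
# shadow value for each tracked tensor and, every time ema.apply(...) runs, updates it as
#     shadow = decay * shadow + (1 - decay) * value
# With decay=0.5 the current batch statistics are weighted heavily. A plain-Python sketch
# of that single update step, with illustrative names (not part of the tutorial network):
def _ema_update(shadow, value, decay=0.5):
    # one exponential-moving-average step, as applied to fc_mean / fc_var above
    return decay * shadow + (1.0 - decay) * value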

# make up data
fix_seed(1)
x_data = np.linspace(-7, 10, 2500)[:, np.newaxis]
np.random.shuffle(x_data)
noise = np.random.normal(0, 8, x_data.shape)
y_data = np.square(x_data) - 5 + noise

# plot input data
plt.scatter(x_data, y_data)
plt.show()

xs = tf.placeholder(tf.float32, [None, 1]) # [num_samples, num_features]
ys = tf.placeholder(tf.float32, [None, 1])

train_op, cost, layers_inputs = built_net(xs, ys, norm=False) # without BN
train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True) # with BN

sess = tf.Session()
if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
    init = tf.initialize_all_variables()
else:
    init = tf.global_variables_initializer()
sess.run(init)

# record cost
cost_his = []
cost_his_norm = []
record_step = 5

plt.ion()
plt.figure(figsize=(7, 3))
for i in range(250):
    if i % 50 == 0:
        # plot histogram
        all_inputs, all_inputs_norm = sess.run([layers_inputs, layers_inputs_norm], feed_dict={xs: x_data, ys: y_data})
        plot_his(all_inputs, all_inputs_norm)

    # train on batch
    sess.run([train_op, train_op_norm], feed_dict={xs: x_data[i*10:i*10+10], ys: y_data[i*10:i*10+10]})

    if i % record_step == 0:
        # record cost
        cost_his.append(sess.run(cost, feed_dict={xs: x_data, ys: y_data}))
        cost_his_norm.append(sess.run(cost_norm, feed_dict={xs: x_data, ys: y_data}))

# matplotlib's default display mode is "blocking": plt.show() pauses the program and
# nothing after it runs. To show a live, updating figure, call plt.ion() first to switch
# from blocking to interactive mode.
# Do not forget plt.ioff() before the final plt.show(); otherwise the window just flashes
# and closes instead of staying on screen.
plt.ioff()
plt.figure()
# plot the cost history without batch normalization
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his), label='no BN')

# plot the cost history with batch normalization
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his_norm), label='BN')

plt.legend()
plt.show()

Results:

(figure: batch normalization results)

Summary: after batch normalization the data in each layer stays concentrated instead of being pushed toward one extreme, so training works better and the result generalizes better.