Best Python code snippet using locust
bn.py
Source:bn.py
1from __future__ import absolute_import, print_function, division2import numpy3import theano4from theano import Apply, Op5from theano.gof import local_optimizer6from theano.gof.opt import copy_stack_trace7from theano.tensor import as_tensor_variable, TensorType8from theano.tensor import basic as T9from theano.tensor.opt import register_specialize_device10from theano.scalar import Composite, as_common_dtype11from theano.scalar import add, sub, true_div, mul12class BNComposite(Composite):13 init_param = ('dtype',)14 @theano.configparser.change_flags(compute_test_value='off')15 def __init__(self, dtype):16 self.dtype = dtype17 x = theano.scalar.Scalar(dtype=dtype).make_variable()18 mean = theano.scalar.Scalar(dtype=dtype).make_variable()19 std = theano.scalar.Scalar(dtype=dtype).make_variable()20 gamma = theano.scalar.Scalar(dtype=dtype).make_variable()21 beta = theano.scalar.Scalar(dtype=dtype).make_variable()22 o = add(mul(true_div(sub(x, mean), std), gamma), beta)23 inputs = [x, mean, std, gamma, beta]24 outputs = [o]25 super(BNComposite, self).__init__(inputs, outputs)26 def grad(self, inps, grads):27 x, mean, std, gamma, beta = inps28 top, = grads29 top_gamma = top * gamma30 x_mean = x - mean31 dx = top_gamma / std32 dmean = -dx33 dstd = -(top_gamma * x_mean) / (std * std)34 dgamma = top * x_mean / std35 return [dx, dmean, dstd, dgamma, top]36def batch_normalization(inputs, gamma, beta, mean, std,37 mode='low_mem'):38 """39 This function will build the symbolic graph for applying batch normalization40 to a set of activations.41 Also works on GPUs, but is not optimized using cuDNN.42 .. 
versionadded:: 0.7.143 Parameters44 ----------45 inputs : symbolic tensor46 Mini-batch of activations47 gamma: symbolic tensor48 BN scale parameter, must be of same dimensionality as49 inputs and broadcastable against it50 beta: symbolic tensor51 BN shift parameter, must be of same dimensionality as52 inputs and broadcastable against it53 mean: symbolic tensor54 inputs means, must be of same dimensionality as55 inputs and broadcastable against it56 std: symbolic tensor57 inputs standard deviation, must be of same dimensionality as58 inputs and broadcastable against it59 mode: 'low_mem' or 'high_mem'60 Specify which batch_normalization implementation that will be61 used.62 As no intermediate representations are stored for the back-propagation,63 'low_mem' implementation lower the memory usage, however,64 it is 5-10% slower than 'high_mem' implementation. Note that 5-10% computation65 time difference compare the batch_normalization operation only, time difference66 between implementation is likely to be less important on the full model fprop/bprop.67 """68 if mode == 'low_mem':69 elm_bn = theano.tensor.elemwise.Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))70 rval = elm_bn(inputs, mean, std, gamma, beta)71 elif mode == 'high_mem':72 rval = (inputs - mean) * (gamma / std) + beta73 else:74 raise ValueError(75 'mode must be either "low_mem", "high_mem"')76 return rval77def _prepare_batch_normalization_axes(axes, ndim):78 if axes == 'per-activation':79 axes = (0,)80 elif axes == 'spatial':81 axes = (0,) + tuple(range(2, ndim))82 elif isinstance(axes, (tuple, list, numpy.ndarray)):83 axes = tuple(int(a) for a in axes)84 else:85 raise ValueError('invalid axes: %s', str(axes))86 axes = tuple(sorted(axes))87 if len(axes) == 0:88 raise ValueError('there should be at least one normalization axis')89 if min(axes) < 0 or max(axes) >= ndim:90 raise ValueError('axes should be less than ndim (<%d), but %s given' % (ndim, str(axes)))91 non_bc_axes = tuple(i for i in range(ndim) 
if i not in axes)92 return axes, non_bc_axes93def batch_normalization_train(inputs, gamma, beta, axes='per-activation',94 epsilon=1e-4, running_average_factor=0.1,95 running_mean=None, running_var=None):96 """97 Performs batch normalization of the given inputs, using the mean and98 variance of the inputs.99 Parameters100 ----------101 axes : 'per-activation', 'spatial' or a tuple of ints102 The axes along which the input should be normalized. ``'per-activation'``103 normalizes per activation and is equal to ``axes=(0,)``.104 ``'spatial'`` shares normalization factors across spatial dimensions105 (i.e., all dimensions past the second), which for 4D inputs would be106 equal to ``axes=(0, 2, 3)``.107 gamma : tensor108 Learnable scale factors. The shape must match the shape of `inputs`,109 except for the axes in `axes`. These axes should be set to 1 or be110 skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).111 beta : tensor112 Learnable biases. Must match the tensor layout of `gamma`.113 epsilon : float114 Epsilon value used in the batch normalization formula. Minimum allowed115 value is 1e-5 (imposed by cuDNN).116 running_average_factor : float117 Factor for updating the values or `running_mean` and `running_var`.118 If the factor is close to one, the running averages will update quickly,119 if the factor is close to zero it will update slowly.120 running_mean : tensor or None121 Previous value of the running mean. If this is given, the new value122 ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``123 will be returned as one of the outputs of this function.124 `running_mean` and `running_var` should either both be given or125 both be None. The shape should match that of `gamma` and `beta`.126 running_var : tensor or None127 Previous value of the running variance. 
If this is given, the new value128 ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``129 will be returned as one of the outputs of this function,130 where `m` is the product of lengths of the averaged-over dimensions.131 `running_mean` and `running_var` should either both be given or132 both be None. The shape should match that of `gamma` and `beta`.133 Returns134 -------135 out : tensor136 Batch-normalized inputs.137 mean : tensor138 Means of `inputs` across the normalization axes.139 invstd : tensor140 Inverse standard deviations of `inputs` across the normalization axes.141 new_running_mean : tensor142 New value of the running mean (only if both `running_mean` and143 `running_var` were given).144 new_running_var : tensor145 New value of the running variance (only if both `running_var` and146 `running_mean` were given).147 Notes148 -----149 If per-activation or spatial normalization is selected, this operation150 will use the cuDNN implementation. (This requires cuDNN 5 or newer.)151 The returned values are equivalent to:152 .. 
code-block:: python153 # for per-activation normalization154 axes = (0,)155 # for spatial normalization156 axes = (0,) + tuple(range(2, inputs.ndim))157 mean = inputs.mean(axes, keepdims=True)158 var = inputs.var(axes, keepdims=True)159 invstd = T.inv(T.sqrt(var + epsilon))160 out = (inputs - mean) * gamma * invstd + beta161 m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')162 running_mean = running_mean * (1 - running_average_factor) + \\163 mean * running_average_factor164 running_var = running_var * (1 - running_average_factor) + \\165 (m / (m - 1)) * var * running_average_factor166 """167 ndim = inputs.ndim168 axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)169 # have the parameter tensors been broadcasted yet?170 if gamma.ndim == ndim:171 params_ndim = ndim172 else:173 params_ndim = len(non_bc_axes)174 params_dimshuffle_pattern = ['x'] * ndim175 for i, axis in enumerate(non_bc_axes):176 params_dimshuffle_pattern[axis] = i177 if gamma.ndim != params_ndim or beta.ndim != params_ndim:178 raise ValueError("gamma and beta dimensionality must match the "179 "number of non-normalized axes, or have the "180 "same number of dimensions as the inputs; "181 "got %d and %d instead of %d" %182 (gamma.ndim, beta.ndim, params_ndim))183 if (running_mean is None) != (running_var is None):184 raise ValueError("running_mean and running_var must either both be "185 "given or both be None")186 if running_mean is not None and running_mean.ndim != params_ndim:187 raise ValueError("running_mean must be of the same dimensionality "188 "as gamma and beta; got %d instead of %d" %189 (running_mean.ndim, params_ndim))190 if running_var is not None and running_var.ndim != params_ndim:191 raise ValueError("running_var must be of the same dimensionality "192 "as gamma and beta; got %d instead of %d" %193 (running_var.ndim, params_ndim))194 # epsilon will be converted to floatX later. 
we need to check195 # for rounding errors now, since numpy.float32(1e-5) < 1e-5.196 epsilon = numpy.cast[theano.config.floatX](epsilon)197 if epsilon < 1e-5:198 raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))199 inputs = as_tensor_variable(inputs)200 gamma = as_tensor_variable(gamma)201 beta = as_tensor_variable(beta)202 if params_ndim != ndim:203 gamma = gamma.dimshuffle(params_dimshuffle_pattern)204 beta = beta.dimshuffle(params_dimshuffle_pattern)205 else:206 gamma = T.addbroadcast(gamma, *axes)207 beta = T.addbroadcast(beta, *axes)208 batchnorm_op = AbstractBatchNormTrain(axes=axes)209 if running_mean is not None and running_var is not None:210 running_mean = as_tensor_variable(running_mean)211 running_var = as_tensor_variable(running_var)212 if params_ndim != ndim:213 running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)214 running_var = running_var.dimshuffle(params_dimshuffle_pattern)215 else:216 running_mean = T.addbroadcast(running_mean, *axes)217 running_var = T.addbroadcast(running_var, *axes)218 out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(219 inputs, gamma, beta, epsilon=epsilon,220 running_average_factor=running_average_factor,221 running_mean=running_mean, running_var=running_var)222 if new_running_mean.broadcastable != running_mean.broadcastable:223 new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)224 if new_running_var.broadcastable != running_var.broadcastable:225 new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)226 results = (out, mean, invstd, new_running_mean, new_running_var)227 else:228 results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)229 if params_ndim != ndim:230 # remove the broadcasted dimensions (except from the output)231 results = ([results[0]] +232 [r.dimshuffle(non_bc_axes) for r in results[1:]])233 return tuple(results)234def batch_normalization_test(inputs, gamma, beta, mean, var,235 
axes='per-activation', epsilon=1e-4):236 """237 Performs batch normalization of the given inputs, using the given mean and238 variance.239 Parameters240 ----------241 axes : 'per-activation', 'spatial' or a tuple of ints242 The axes along which the input should be normalized. ``'per-activation'``243 normalizes per activation and is equal to ``axes=(0,)``.244 ``'spatial'`` shares normalization factors across spatial dimensions245 (i.e., all dimensions past the second), which for 4D inputs would be246 equal to ``axes=(0, 2, 3)``.247 gamma : tensor248 Scale factors. The shape must match the shape of `inputs`,249 except for the axes in `axes`. These axes should be set to 1 or be250 skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).251 beta : tensor252 Biases. Must match the tensor layout of `gamma`.253 mean : tensor254 Means. Usually these are running averages computed during training.255 Must match the tensor layout of `gamma`.256 var : tensor257 Variances. Usually these are running averages computed during training.258 Must match the tensor layout of `gamma`.259 epsilon : float260 Epsilon value used in the batch normalization formula. Minimum allowed261 value is 1e-5 (imposed by cuDNN).262 Returns263 -------264 out : tensor265 Batch-normalized inputs.266 Notes267 -----268 If per-activation or spatial normalization is selected, this operation269 will use the cuDNN implementation. (This requires cuDNN 5 or newer.)270 The returned value is equivalent to:271 .. 
code-block:: python272 # for per-activation normalization273 axes = (0,)274 # for spatial normalization275 axes = (0,) + tuple(range(2, inputs.ndim))276 gamma, beta, mean, var = (T.addbroadcast(t, *axes)277 for t in (gamma, beta, mean, var))278 out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta279 """280 ndim = inputs.ndim281 axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)282 # have the parameter tensors been broadcasted yet?283 if gamma.ndim == ndim:284 params_ndim = ndim285 else:286 params_ndim = len(non_bc_axes)287 params_dimshuffle_pattern = ['x'] * ndim288 for i, axis in enumerate(non_bc_axes):289 params_dimshuffle_pattern[axis] = i290 if gamma.ndim != params_ndim or beta.ndim != params_ndim:291 raise ValueError("gamma and beta dimensionality must match the "292 "number of non-normalized axes, or have the "293 "same number of dimensions as the inputs; "294 "got %d and %d instead of %d" %295 (gamma.ndim, beta.ndim, params_ndim))296 if mean.ndim != params_ndim or var.ndim != params_ndim:297 raise ValueError("mean and var must be of the same dimensionality "298 "as gamma and beta; got %d and %d instead of %d" %299 (mean.ndim, var.ndim, params_ndim))300 # epsilon will be converted to floatX later. 
we need to check301 # for rounding errors now, since numpy.float32(1e-5) < 1e-5.302 epsilon = numpy.cast[theano.config.floatX](epsilon)303 if epsilon < 1e-5:304 raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))305 gamma = as_tensor_variable(gamma)306 beta = as_tensor_variable(beta)307 mean = as_tensor_variable(mean)308 var = as_tensor_variable(var)309 if params_ndim != ndim:310 gamma = gamma.dimshuffle(params_dimshuffle_pattern)311 beta = beta.dimshuffle(params_dimshuffle_pattern)312 mean = mean.dimshuffle(params_dimshuffle_pattern)313 var = var.dimshuffle(params_dimshuffle_pattern)314 else:315 gamma = T.addbroadcast(gamma, *axes)316 beta = T.addbroadcast(beta, *axes)317 mean = T.addbroadcast(mean, *axes)318 var = T.addbroadcast(var, *axes)319 batchnorm_op = AbstractBatchNormInference(axes=axes)320 return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)321class AbstractBatchNormTrain(Op):322 """323 Abstract Op for Batch Normalization.324 Parameters325 ----------326 axes : a tuple of ints327 The axes along which the input should be normalized.328 x : tensor329 The input to be normalized along `axes`.330 scale : tensor331 `scale` should have the same number of dimensions as `x`.332 All dimensions listed in `axes` should have length 1.333 bias : tensor334 `bias` should have the same number of dimensions as `x`.335 All dimensions listed in `axes` should have length 1.336 epsilon337 Epsilon value used in the batch normalization formula. Minimum allowed338 value is 1e-5 (imposed by cuDNN).339 running_average_factor : float340 Factor for updating the values or `running_mean` and `running_var`.341 If the factor is close to one, the running averages will update quickly,342 if the factor is close to zero it will update slowly.343 running_mean : tensor or None344 Previous value of the running mean. 
If this is given, the new value345 ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``346 will be returned as one of the outputs of this function.347 `running_mean` and `running_var` should either both be given or348 both be None.349 running_var : tensor or None350 Previous value of the running variance. If this is given, the new value351 ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``352 will be returned as one of the outputs of this function,353 where `m` is the product of lengths of the averaged-over dimensions.354 `running_mean` and `running_var` should either both be given or355 both be None.356 """357 __props__ = ('axes',)358 def __init__(self, axes=(0,)):359 assert isinstance(axes, (tuple, list))360 assert len(axes) > 0361 axes = tuple(int(a) for a in axes)362 self.axes = axes363 def infer_shape(self, node, shape):364 return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)365 def make_node(self, x, scale, bias, epsilon=1e-4,366 running_average_factor=0.1,367 running_mean=None, running_var=None):368 x = as_tensor_variable(x)369 scale = as_tensor_variable(scale)370 bias = as_tensor_variable(bias)371 epsilon = as_tensor_variable(epsilon)372 running_average_factor = as_tensor_variable(running_average_factor)373 if running_mean is not None:374 running_mean = as_tensor_variable(running_mean)375 if running_var is not None:376 running_var = as_tensor_variable(running_var)377 assert x.ndim == scale.ndim == bias.ndim378 assert ((running_mean is None and running_var is None) or379 (running_mean is not None and running_var is not None))380 assert (running_mean is None or running_mean.ndim == x.ndim)381 assert (running_var is None or running_var.ndim == x.ndim)382 # Upcast to common dtype on the non-scalar383 # Keep as is dtype of scalar (epsilon and running_average_factor)384 if running_mean:385 x, scale, bias, running_mean, running_var = as_common_dtype(386 x, scale, bias, 
running_mean, running_var)387 else:388 x, scale, bias = as_common_dtype(x, scale, bias)389 inputs = [x, scale, bias, epsilon, running_average_factor]390 output_types = [x.type(), scale.type(), scale.type()]391 if running_mean is not None and running_var is not None:392 inputs.append(running_mean)393 inputs.append(running_var)394 output_types.append(scale.type())395 output_types.append(scale.type())396 return Apply(self, inputs, output_types)397 def L_op(self, inputs, outputs, grads):398 x, scale, bias, epsilon, running_average_factor = inputs[:5]399 dy = grads[0]400 _, x_mean, x_invstd = outputs[:3]401 disconnected_outputs = [402 theano.gradient.DisconnectedType()(), # epsilon403 theano.gradient.DisconnectedType()()] # running_average_factor404 # Optional running_mean and running_var.405 for i in range(5, len(inputs)):406 disconnected_outputs.append(theano.gradient.DisconnectedType()())407 return AbstractBatchNormTrainGrad(self.axes)(408 x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs409 def connection_pattern(self, node):410 # Specificy that epsilon and running_average_factor are not connected to outputs.411 patterns = [[True, True, True], # x412 [True, True, True], # scale413 [True, True, True], # bias414 [False, False, False], # epsilon415 [False, False, False]] # running_average_factor416 # Optional running_mean and running_var are only417 # connected to their new values.418 for i in range(5, len(node.inputs)):419 patterns[0].append(True)420 for pattern in patterns[1:]:421 pattern.append(False)422 patterns.append([False] * (3 + i - 5) + [True])423 return patterns424 def perform(self, node, inputs, output_storage):425 x, scale, bias, epsilon, running_average_factor = inputs[:5]426 axes = self.axes427 if min(axes) < 0 or max(axes) >= x.ndim:428 raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))429 mean = x.mean(axes, keepdims=True)430 var = x.var(axes, keepdims=True)431 invstd = 1.0 / numpy.sqrt(var + 
epsilon)432 out = (x - mean) * (scale * invstd) + bias433 output_storage[0][0] = out434 output_storage[1][0] = mean435 output_storage[2][0] = invstd436 if len(inputs) > 5:437 running_mean = inputs[5]438 running_mean = running_mean * (1.0 - running_average_factor) + \439 mean * running_average_factor440 output_storage[3][0] = running_mean441 if len(inputs) > 6:442 m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))443 running_var = inputs[6]444 running_var = running_var * (1.0 - running_average_factor) + \445 (m / (m - 1)) * var * running_average_factor446 output_storage[4][0] = running_var447class AbstractBatchNormInference(Op):448 """449 Abstract Op for Batch Normalization.450 Parameters451 ----------452 axes : a tuple of ints453 The axes along which the input is normalized.454 epsilon455 Epsilon value used in the batch normalization formula. Minimum allowed456 value is 1e-5 (imposed by cuDNN).457 """458 __props__ = ('axes',)459 def __init__(self, axes=(0,)):460 assert isinstance(axes, (tuple, list))461 assert len(axes) > 0462 axes = tuple(int(a) for a in axes)463 self.axes = axes464 def infer_shape(self, node, shape):465 return [shape[0]]466 def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):467 x = as_tensor_variable(x)468 scale = as_tensor_variable(scale)469 bias = as_tensor_variable(bias)470 estimated_mean = as_tensor_variable(estimated_mean)471 estimated_variance = as_tensor_variable(estimated_variance)472 epsilon = as_tensor_variable(epsilon)473 # Upcast to common dtype on the non-scalar474 # Keep as is dtype of scalar (epsilon)475 x, scale, bias, estimated_mean, estimated_variance = as_common_dtype(476 x, scale, bias, estimated_mean, estimated_variance)477 assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim478 return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])479 def grad(self, inputs, grads):480 x, scale, bias, est_mean, est_var, 
epsilon = inputs481 dy = grads[0]482 axes = self.axes483 if min(axes) < 0 or max(axes) >= x.ndim:484 raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))485 scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)486 for t in (scale, bias, est_mean, est_var))487 # define helper expressions488 est_var_eps = est_var + epsilon489 est_std = theano.tensor.sqrt(est_var_eps)490 two = theano.tensor.constant(2.)491 # define and return gradients492 dx = dy * (scale / est_std)493 dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std494 dbias = dy.sum(axes, keepdims=True)495 dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)496 dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))497 return [dx, dscale, dbias, dmean, dvar, theano.gradient.DisconnectedType()()]498 def connection_pattern(self, node):499 # Specificy that epsilon is not connected to outputs.500 return [[True], [True], [True], [True], [True], [False]]501 def perform(self, node, inputs, output_storage):502 x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs503 out = (x - estimated_mean) * (scale / numpy.sqrt(estimated_variance + epsilon)) + bias504 output_storage[0][0] = out505class AbstractBatchNormTrainGrad(Op):506 __props__ = ('axes',)507 def __init__(self, axes=(0,)):508 assert isinstance(axes, (tuple, list))509 assert len(axes) > 0510 axes = tuple(int(a) for a in axes)511 self.axes = axes512 def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):513 x = as_tensor_variable(x)514 dy = as_tensor_variable(dy)515 scale = as_tensor_variable(scale)516 x_mean = as_tensor_variable(x_mean)517 x_invstd = as_tensor_variable(x_invstd)518 epsilon = as_tensor_variable(epsilon)519 # Upcast to common dtype on the non-scalar520 # Keep as is dtype of scalar (epsilon)521 x, dy, scale, x_mean, x_invstd = as_common_dtype(522 x, dy, scale, x_mean, x_invstd)523 assert x.ndim == dy.ndim == 
scale.ndim == x_mean.ndim == x_invstd.ndim524 return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],525 [x.type(), scale.type(), scale.type()])526 def infer_shape(self, node, shape):527 return [shape[0], shape[2], shape[2]]528 def perform(self, node, inputs, output_storage):529 x, dy, scale, x_mean, x_invstd, epsilon = inputs530 axes = self.axes531 if min(axes) < 0 or max(axes) >= x.ndim:532 raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))533 x_diff = x - x_mean534 mean_dy_x_diff = numpy.mean(dy * x_diff, axis=axes, keepdims=True)535 c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd ** 3))536 g_wrt_inputs = scale * (c - numpy.mean(c, axis=axes, keepdims=True))537 g_wrt_scale = numpy.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)538 g_wrt_bias = numpy.sum(dy, axis=axes, keepdims=True)539 output_storage[0][0] = g_wrt_inputs540 output_storage[1][0] = g_wrt_scale541 output_storage[2][0] = g_wrt_bias542@local_optimizer([AbstractBatchNormTrain])543def local_abstract_batch_norm_train(node):544 if not isinstance(node.op, AbstractBatchNormTrain):545 return None546 x, scale, bias, epsilon, running_average_factor = node.inputs[:5]547 axes = node.op.axes548 if min(axes) < 0 or max(axes) > x.ndim:549 return None550 if not isinstance(x.type, TensorType) or \551 not isinstance(scale.type, TensorType) or \552 not isinstance(bias.type, TensorType) or \553 not isinstance(epsilon.type, TensorType) or \554 not isinstance(running_average_factor.type, TensorType):555 return None556 # optional running_mean and running_var557 if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):558 return None559 if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):560 return None561 mean = x.mean(axes, keepdims=True)562 var = x.var(axes, keepdims=True)563 # The epsilon should not upcast the dtype.564 if var.dtype == 'float32' and epsilon.dtype == 'float64':565 epsilon = 
epsilon.astype('float32')566 invstd = T.inv(T.sqrt(var + epsilon))567 out = (x - mean) * (scale * invstd) + bias568 results = [out, mean, invstd]569 if len(node.inputs) > 5:570 running_mean = node.inputs[5]571 running_mean = running_mean * (1.0 - running_average_factor) + \572 mean * running_average_factor573 results.append(running_mean)574 if len(node.inputs) > 6:575 m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)576 running_var = node.inputs[6]577 running_var = running_var * (1.0 - running_average_factor) + \578 (m / (m - 1)) * var * running_average_factor579 results.append(running_var)580 results = [T.patternbroadcast(r, r_orig.broadcastable)581 for (r, r_orig) in zip(results, node.outputs)]582 for var in theano.gof.graph.variables(node.inputs, results):583 if var not in node.inputs:584 copy_stack_trace(node.outputs[0], var)585 return results586@local_optimizer([AbstractBatchNormTrainGrad])587def local_abstract_batch_norm_train_grad(node):588 if not isinstance(node.op, AbstractBatchNormTrainGrad):589 return None590 x, dy, scale, x_mean, x_invstd, epsilon = node.inputs591 axes = node.op.axes592 if min(axes) < 0 or max(axes) > x.ndim:593 return None594 if not isinstance(x.type, TensorType) or \595 not isinstance(dy.type, TensorType) or \596 not isinstance(scale.type, TensorType) or \597 not isinstance(x_mean.type, TensorType) or \598 not isinstance(x_invstd.type, TensorType) or \599 not isinstance(epsilon.type, TensorType):600 return None601 x_diff = x - x_mean602 mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)603 c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))604 g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True))605 g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)606 g_wrt_bias = T.sum(dy, axis=axes, keepdims=True)607 results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]608 results = [T.patternbroadcast(r, r_orig.broadcastable)609 for (r, r_orig) in zip(results, 
node.outputs)]610 for var in theano.gof.graph.variables(node.inputs, results):611 if var not in node.inputs:612 copy_stack_trace(node.outputs[0], var)613 return results614@local_optimizer([AbstractBatchNormInference])615def local_abstract_batch_norm_inference(node):616 if not isinstance(node.op, AbstractBatchNormInference):617 return None618 x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs619 if not isinstance(x.type, TensorType) or \620 not isinstance(scale.type, TensorType) or \621 not isinstance(bias.type, TensorType) or \622 not isinstance(estimated_mean.type, TensorType) or \623 not isinstance(estimated_variance.type, TensorType) or \624 not isinstance(epsilon.type, TensorType):625 return None626 # The epsilon should not upcast the dtype.627 if estimated_variance.dtype == 'float32' and epsilon.dtype == 'float64':628 epsilon = epsilon.astype('float32')629 result = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias630 result = T.patternbroadcast(result, node.outputs[0].broadcastable)631 for var in theano.gof.graph.variables(node.inputs, [result]):632 if var not in node.inputs:633 copy_stack_trace(node.outputs[0], var)634 return [result]635# Register Cpu Optmization636bn_groupopt = theano.gof.optdb.LocalGroupDB()637bn_groupopt.__name__ = 'batchnorm_opts'638register_specialize_device(bn_groupopt, 'fast_compile', 'fast_run')639bn_groupopt.register('local_abstract_batch_norm_train',640 local_abstract_batch_norm_train, 30,641 'fast_compile', 'fast_run')642bn_groupopt.register('local_abstract_batch_norm_train_grad',643 local_abstract_batch_norm_train_grad, 30,644 'fast_compile', 'fast_run')645bn_groupopt.register('local_abstract_batch_norm_inference',646 local_abstract_batch_norm_inference, 30,...
layers.py
Source:layers.py
import numpy as np


def affine_forward(x, w, b):
    r"""
    Computes the forward pass for an affine (fully-connected) layer.

    The input x has shape (N, d_1, ..., d_k) where x[i] is the ith input.
    We multiply this against a weight matrix of shape (D, M) where
    D = \prod_i d_i

    Inputs:
    x - Input data, of shape (N, d_1, ..., d_k)
    w - Weights, of shape (D, M)
    b - Biases, of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    # Flatten every example to a row vector before the matrix product.
    out = x.reshape(x.shape[0], -1).dot(w) + b
    cache = (x, w, b)
    return out, cache


def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)
      - b: Biases (unpacked from the cache but not used here)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    dx = dout.dot(w.T).reshape(x.shape)
    dw = x.reshape(x.shape[0], -1).T.dot(dout)
    db = np.sum(dout, axis=0)
    return dx, dw, db


def relu_forward(x):
    """
    Computes the forward pass for a layer of rectified linear units (ReLUs).

    Input:
    - x: Inputs, of any shape

    Returns a tuple of:
    - out: Output, of the same shape as x
    - cache: x
    """
    out = np.maximum(0, x)
    cache = x
    return out, cache


def relu_backward(dout, cache):
    """
    Computes the backward pass for a layer of rectified linear units (ReLUs).

    Input:
    - dout: Upstream derivatives, of any shape
    - cache: Input x, of same shape as dout

    Returns:
    - dx: Gradient with respect to x
    """
    x = cache
    # The gradient only flows where the input was strictly positive.
    dx = np.where(x > 0, dout, 0)
    return dx


def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming data.
    During training we also keep an exponentially decaying running mean of the mean
    and variance of each feature, and these averages are used to normalize data
    at test-time.

    At each timestep we update the running averages for mean and variance using
    an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7 implementation
    of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var Array of shape (D,) giving running variance of features

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: A tuple of values needed in the backward pass

    Raises ValueError if mode is neither 'train' nor 'test'.

    Note: in 'train' mode the running_mean / running_var arrays found in
    bn_param are updated in place, and bn_param itself is updated with them.
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        # Compute output
        mu = x.mean(axis=0)
        xc = x - mu
        var = np.mean(xc ** 2, axis=0)
        std = np.sqrt(var + eps)
        xn = xc / std
        out = gamma * xn + beta

        cache = (mode, x, gamma, xc, std, xn, out)

        # Update running average of mean
        running_mean *= momentum
        running_mean += (1 - momentum) * mu

        # Update running average of variance
        running_var *= momentum
        running_var += (1 - momentum) * var
    elif mode == 'test':
        # Using running mean and variance to normalize
        std = np.sqrt(running_var + eps)
        xn = (x - running_mean) / std
        out = gamma * xn + beta
        cache = (mode, x, xn, gamma, beta, std)
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache


def batchnorm_backward(dout, cache):
    """
    Backward pass for batch normalization.

    For this implementation, you should write out a computation graph for
    batch normalization on paper and propagate gradients backward through
    intermediate nodes.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from batchnorm_forward.

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)

    Raises ValueError if the mode stored in the cache is neither 'train'
    nor 'test'.
    """
    mode = cache[0]
    if mode == 'train':
        mode, x, gamma, xc, std, xn, out = cache

        N = x.shape[0]
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(xn * dout, axis=0)
        dxn = gamma * dout
        dxc = dxn / std
        dstd = -np.sum((dxn * xc) / (std * std), axis=0)
        dvar = 0.5 * dstd / std
        dxc += (2.0 / N) * xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / N
    elif mode == 'test':
        # The statistics are constants at test time, so only the scaling
        # by gamma / std is propagated.
        mode, x, xn, gamma, beta, std = cache
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(xn * dout, axis=0)
        dxn = gamma * dout
        dx = dxn / std
    else:
        raise ValueError(mode)

    return dx, dgamma, dbeta


def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """
    Computes the forward pass for spatial batch normalization.

    Inputs:
    - x: Input data of shape (N, C, H, W)
    - gamma: Scale parameter, of shape (C,)
    - beta: Shift parameter, of shape (C,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance. momentum=0 means that
        old information is discarded completely at every time step, while
        momentum=1 means that new information is never incorporated. The
        default of momentum=0.9 should work well in most situations.
      - running_mean: Array of shape (C,) giving running mean of features
      - running_var Array of shape (C,) giving running variance of features

    Returns a tuple of:
    - out: Output data, of shape (N, C, H, W)
    - cache: Values needed for the backward pass
    """
    N, C, H, W = x.shape
    # Move channels last and flatten to (N*H*W, C) so that the plain
    # batchnorm normalizes each channel over batch and spatial positions.
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
    out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return out, cache


def spatial_batchnorm_backward(dout, cache):
    """
    Computes the backward pass for spatial batch normalization.

    Inputs:
    - dout: Upstream derivatives, of shape (N, C, H, W)
    - cache: Values from the forward pass

    Returns a tuple of:
    - dx: Gradient with respect to inputs, of shape (N, C, H, W)
    - dgamma: Gradient with respect to scale parameter, of shape (C,)
    - dbeta: Gradient with respect to shift parameter, of shape (C,)
    """
    N, C, H, W = dout.shape
    dout_flat = dout.transpose(0, 2, 3, 1).reshape(-1, C)
    dx_flat, dgamma, dbeta = batchnorm_backward(dout_flat, cache)
    dx = dx_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return dx, dgamma, dbeta


# NOTE: the source is truncated inside the docstring of `svm_loss`; its body
# is not visible. The visible fragment is preserved below as a comment so
# that no content is lost.
#
# def svm_loss(x, y):
#     """
#     Computes the loss and gradient using for multiclass SVM classification.
#     Inputs:
#     - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
#       for the ith input.
#     - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
#       0 <= y[i] <
C212 Returns a tuple of:213 - loss: Scalar giving the loss214 - dx: Gradient of the loss with respect to x215 """216 N = x.shape[0]217 correct_class_scores = x[np.arange(N), y]218 margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)219 margins[np.arange(N), y] = 0220 loss = np.sum(margins) / N221 num_pos = np.sum(margins > 0, axis=1)222 dx = np.zeros_like(x)223 dx[margins > 0] = 1224 dx[np.arange(N), y] -= num_pos225 dx /= N226 return loss, dx227def softmax_loss(x, y):228 """229 Computes the loss and gradient for softmax classification.230 Inputs:231 - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class232 for the ith input.233 - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and234 0 <= y[i] < C235 Returns a tuple of:236 - loss: Scalar giving the loss237 - dx: Gradient of the loss with respect to x238 """239 probs = np.exp(x - np.max(x, axis=1, keepdims=True))240 probs /= np.sum(probs, axis=1, keepdims=True)241 N = x.shape[0]242 loss = -np.sum(np.log(probs[np.arange(N), y])) / N243 dx = probs.copy()244 dx[np.arange(N), y] -= 1245 dx /= N...
batch_norm.py
Source:batch_norm.py
# encoding: utf-8
"""
@author:  liaoxingyu
@contact: sherlockliao01@gmail.com
"""

import logging

import torch
import torch.nn.functional as F
from torch import nn

__all__ = ["IBN", "get_norm"]


class BatchNorm(nn.BatchNorm2d):
    """``nn.BatchNorm2d`` with constant initialisation and optional freezing of weight/bias.

    Args:
        num_features: channel count, forwarded to ``nn.BatchNorm2d``.
        eps, momentum: forwarded to ``nn.BatchNorm2d``.
        weight_freeze, bias_freeze: when true, ``requires_grad`` is turned off on the
            corresponding parameter.
        weight_init, bias_init: constant used to fill the parameter; ``None`` keeps
            the parent's initialisation.
        **kwargs: accepted and ignored (not forwarded to ``nn.BatchNorm2d``).
    """

    def __init__(self, num_features, eps=1e-05, momentum=0.1, weight_freeze=False, bias_freeze=False, weight_init=1.0,
                 bias_init=0.0, **kwargs):
        super().__init__(num_features, eps=eps, momentum=momentum)
        if weight_init is not None:
            nn.init.constant_(self.weight, weight_init)
        if bias_init is not None:
            nn.init.constant_(self.bias, bias_init)
        self.weight.requires_grad_(not weight_freeze)
        self.bias.requires_grad_(not bias_freeze)


class SyncBatchNorm(nn.SyncBatchNorm):
    """``nn.SyncBatchNorm`` with the same init/freeze options as :class:`BatchNorm`.

    ``**kwargs`` is accepted and ignored, matching the other norm classes in this
    file: ``get_norm`` forwards its extra keyword arguments to whichever class is
    selected, so a keyword meant for another norm type must not raise here.
    """

    def __init__(self, num_features, eps=1e-05, momentum=0.1, weight_freeze=False, bias_freeze=False, weight_init=1.0,
                 bias_init=0.0, **kwargs):
        super().__init__(num_features, eps=eps, momentum=momentum)
        if weight_init is not None:
            nn.init.constant_(self.weight, weight_init)
        if bias_init is not None:
            nn.init.constant_(self.bias, bias_init)
        self.weight.requires_grad_(not weight_freeze)
        self.bias.requires_grad_(not bias_freeze)


class IBN(nn.Module):
    """Normalise the first ``planes // 2`` channels with InstanceNorm and the rest with ``bn_norm``.

    Args:
        planes: total number of input channels.
        bn_norm: norm selector passed to :func:`get_norm` for the second group.
        **kwargs: forwarded to :func:`get_norm`.
    """

    def __init__(self, planes, bn_norm, **kwargs):
        super().__init__()
        half1 = planes // 2
        self.half = half1
        half2 = planes - half1
        self.IN = nn.InstanceNorm2d(half1, affine=True)
        self.BN = get_norm(bn_norm, half2, **kwargs)

    def forward(self, x):
        # Split with explicit sizes. Passing the chunk size ``self.half`` alone yields
        # three chunks when the channel count is odd, so the second chunk would not
        # match the ``planes - half`` channels the BN branch was built for.
        first, second = torch.split(x, [self.half, x.shape[1] - self.half], 1)
        out1 = self.IN(first.contiguous())
        out2 = self.BN(second.contiguous())
        return torch.cat((out1, out2), 1)


class GhostBatchNorm(BatchNorm):
    """Batch norm that computes training statistics separately on ``num_splits`` batch slices.

    In the training branch the input is viewed as ``(-1, C * num_splits, H, W)``,
    so the batch size must be divisible by ``num_splits`` for the view to succeed.
    """

    def __init__(self, num_features, num_splits=1, **kwargs):
        super().__init__(num_features, **kwargs)
        self.num_splits = num_splits
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, input):
        N, C, H, W = input.shape
        if self.training or not self.track_running_stats:
            # Give every split its own copy of the running stats for this call ...
            self.running_mean = self.running_mean.repeat(self.num_splits)
            self.running_var = self.running_var.repeat(self.num_splits)
            outputs = F.batch_norm(
                input.view(-1, C * self.num_splits, H, W), self.running_mean, self.running_var,
                self.weight.repeat(self.num_splits), self.bias.repeat(self.num_splits),
                True, self.momentum, self.eps).view(N, C, H, W)
            # ... then collapse them back to one value per feature by averaging over splits.
            self.running_mean = torch.mean(self.running_mean.view(self.num_splits, self.num_features), dim=0)
            self.running_var = torch.mean(self.running_var.view(self.num_splits, self.num_features), dim=0)
            return outputs
        else:
            return F.batch_norm(
                input, self.running_mean, self.running_var,
                self.weight, self.bias, False, self.momentum, self.eps)


class FrozenBatchNorm(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
    It contains non-trainable buffers called
    "weight" and "bias", "running_mean", "running_var",
    initialized to perform identity transformation.
    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
    which are computed from the original four parameters of BN.
    The affine transform `x * weight + bias` will perform the equivalent
    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
    When loading a backbone model from Caffe2, "running_mean" and "running_var"
    will be left unchanged as identity transformation.
    Other pre-trained backbone models may contain all 4 parameters.
    The forward is implemented by `F.batch_norm(..., training=False)`.
    """

    _version = 3

    def __init__(self, num_features, eps=1e-5, **kwargs):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.register_buffer("weight", torch.ones(num_features))
        self.register_buffer("bias", torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        # ``- eps`` so that ``running_var + eps`` is exactly one at initialisation.
        self.register_buffer("running_var", torch.ones(num_features) - eps)

    def forward(self, x):
        if x.requires_grad:
            # When gradients are needed, F.batch_norm will use extra memory
            # because its backward op computes gradients for weight/bias as well.
            scale = self.weight * (self.running_var + self.eps).rsqrt()
            bias = self.bias - self.running_mean * scale
            scale = scale.reshape(1, -1, 1, 1)
            bias = bias.reshape(1, -1, 1, 1)
            return x * scale + bias
        else:
            # When gradients are not needed, F.batch_norm is a single fused op
            # and provides more optimization opportunities.
            return F.batch_norm(
                x,
                self.running_mean,
                self.running_var,
                self.weight,
                self.bias,
                training=False,
                eps=self.eps,
            )

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # No running_mean/var in early versions.
            # Filling them in silences the missing-key warnings.
            if prefix + "running_mean" not in state_dict:
                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
            if prefix + "running_var" not in state_dict:
                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)

        if version is not None and version < 3:
            logger = logging.getLogger(__name__)
            logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
            # In version < 3, running_var are used without +eps.
            # Subtract out of place: an in-place ``-=`` would also modify the tensor
            # object the caller still holds (e.g. a checkpoint that gets loaded twice).
            var_key = prefix + "running_var"
            state_dict[var_key] = state_dict[var_key] - self.eps

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def __repr__(self):
        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)

    @classmethod
    def convert_frozen_batchnorm(cls, module):
        """
        Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
        Args:
            module (torch.nn.Module):
        Returns:
            If module is BatchNorm/SyncBatchNorm, returns a new module.
            Otherwise, in-place convert module and return it.
        Similar to convert_sync_batchnorm in
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
        """
        bn_module = nn.modules.batchnorm
        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
        res = module
        if isinstance(module, bn_module):
            res = cls(module.num_features)
            if module.affine:
                res.weight.data = module.weight.data.clone().detach()
                res.bias.data = module.bias.data.clone().detach()
            res.running_mean.data = module.running_mean.data
            res.running_var.data = module.running_var.data
            res.eps = module.eps
        else:
            for name, child in module.named_children():
                new_child = cls.convert_frozen_batchnorm(child)
                if new_child is not child:
                    res.add_module(name, new_child)
        return res


def get_norm(norm, out_channels, **kwargs):
    """
    Args:
        norm (str or callable): either one of BN, GhostBN, FrozenBN, GN or syncBN
            (the keys are case-sensitive; an empty string selects no layer);
            or a callable that takes a channel number and returns
            the normalization layer as a nn.Module
        out_channels: number of channels for normalization layer
    Returns:
        nn.Module or None: the normalization layer
    """
    if isinstance(norm, str):
        if len(norm) == 0:
            return None
        norm = {
            "BN": BatchNorm,
            "syncBN": SyncBatchNorm,
            "GhostBN": GhostBatchNorm,
            "FrozenBN": FrozenBatchNorm,
            "GN": lambda channels, **args: nn.GroupNorm(32, channels),
        }[norm]
    # NOTE(review): the listing this file was recovered from is cut off right after
    # the lookup above. The call below is reconstructed from the documented contract
    # (a callable taking the channel count and returning the layer) -- confirm
    # against the upstream file.
    return norm(out_channels, **kwargs)
precise_bn.py
Source:precise_bn.py
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
## https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/precise_bn.py

import itertools
import torch
import torch.nn as nn
import logging
from typing import Iterable, Any
from torch.distributed import ReduceOp, all_reduce

logger = logging.getLogger(__name__)

BN_MODULE_TYPES = (
    torch.nn.BatchNorm1d,
    torch.nn.BatchNorm2d,
    torch.nn.BatchNorm3d,
    torch.nn.SyncBatchNorm,
)


# pyre-fixme[56]: Decorator `torch.no_grad(...)` could not be called, because its
#  type `no_grad` is not callable.
@torch.no_grad()
def update_bn_stats(
    args: Any, model: nn.Module, data_loader: Iterable[Any], num_iters: int = 200  # pyre-ignore
) -> None:
    """
    Recompute and update the batch norm stats to make them more precise. During
    training both BN stats and the weight are changing after every iteration, so
    the running average can not precisely reflect the actual stats of the
    current model.
    In this function, the BN stats are recomputed with fixed weights, to make
    the running average more precise. Specifically, it computes the true average
    of per-batch mean/variance instead of the running average.

    Args:
        args: object providing ``rank`` (rank 0 logs the statistics before and
            after), ``distributed`` (when true, the accumulated statistics are
            summed across processes with ``all_reduce``) and ``gpu_nums`` (the
            divisor applied after that sum).
        model (nn.Module): the model whose bn stats will be recomputed.
            Note that:
            1. This function will not alter the training mode of the given model.
               Users are responsible for setting the layers that need
               precise-BN to training mode, prior to calling this function.
            2. Be careful if your models contain other stateful layers in
               addition to BN, i.e. layers whose state can change in forward
               iterations.  This function will alter their state. If you wish
               them unchanged, you need to either pass in a submodule without
               those layers, or backup the states.
        data_loader (iterator): an iterator. Produce data as inputs to the model.
        num_iters (int): number of iterations to compute the stats.

    Raises:
        RuntimeError: if a BN layer's running mean/variance contains NaN or Inf
            after a forward pass.
        AssertionError: if ``data_loader`` yields fewer than ``num_iters`` batches.
    """
    bn_layers = get_bn_modules(model)
    if not bn_layers:
        return

    # In order to make the running stats only reflect the current batch, the
    # momentum is disabled.
    # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean
    # Setting the momentum to 1.0 to compute the stats without momentum.
    momentum_actual = [bn.momentum for bn in bn_layers]

    if args.rank == 0:
        maxima = [round(bn.running_mean.cpu().numpy().max(), 4) for bn in bn_layers]
        logger.info('bn mean max, %s', max(maxima))
        logger.info(maxima)
        maxima = [round(bn.running_var.cpu().numpy().max(), 4) for bn in bn_layers]
        logger.info('bn var max, %s', max(maxima))
        logger.info(maxima)

    # Note that PyTorch's running_var means "running average of
    # bessel-corrected batch variance". (PyTorch's BN normalizes by biased
    # variance, but updates EMA by unbiased (bessel-corrected) variance).
    # So we estimate population variance by "simple average of bessel-corrected
    # batch variance". This is the same as in the BatchNorm paper, Sec 3.1.
    # This estimator converges to population variance as long as batch size
    # is not too small, and total #samples for PreciseBN is large enough.
    # Its convergence may be affected by small batch size.
    # Alternatively, one can estimate population variance by the sample variance
    # of all batches combined. However, this needs a way to know the batch size
    # of each batch in this function (otherwise we only have access to the
    # bessel-corrected batch variance given by pytorch), which is an extra
    # requirement.
    running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers]
    running_var = [torch.zeros_like(bn.running_var) for bn in bn_layers]

    try:
        for bn in bn_layers:
            # pyre-fixme[16]: `Module` has no attribute `momentum`.
            bn.momentum = 1.0

        ind = -1
        for ind, inputs in enumerate(itertools.islice(data_loader, num_iters)):
            # The decorator already disables autograd for the whole function.
            model(inputs)

            for i, bn in enumerate(bn_layers):
                # Accumulates the bn stats (incremental mean over iterations).
                running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1)
                running_var[i] += (bn.running_var - running_var[i]) / (ind + 1)
                stats = (bn.running_mean, bn.running_var)
                if any(torch.isnan(t).any() for t in stats):
                    raise RuntimeError(
                        "update_bn_stats ERROR(args.rank {}): Got NaN val".format(args.rank))
                if any(torch.isinf(t).any() for t in stats):
                    raise RuntimeError(
                        "update_bn_stats ERROR(args.rank {}): Got INf val".format(args.rank))

        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        if ind != num_iters - 1:
            raise AssertionError(
                "update_bn_stats is meant to run for {} iterations, "
                "but the dataloader stops at {} iterations.".format(num_iters, ind)
            )

        for i, bn in enumerate(bn_layers):
            if args.distributed:
                all_reduce(running_mean[i], op=ReduceOp.SUM)
                all_reduce(running_var[i], op=ReduceOp.SUM)
                running_mean[i] = running_mean[i] / args.gpu_nums
                running_var[i] = running_var[i] / args.gpu_nums

            # Sets the precise bn stats.
            # pyre-fixme[16]: `Module` has no attribute `running_mean`.
            bn.running_mean = running_mean[i]
            # pyre-fixme[16]: `Module` has no attribute `running_var`.
            bn.running_var = running_var[i]
    finally:
        # Always put the original momentum back, including when the loop above
        # raised; otherwise the layers would be left with momentum == 1.0.
        for bn, momentum in zip(bn_layers, momentum_actual):
            bn.momentum = momentum

    if args.rank == 0:
        maxima = [round(t.cpu().numpy().max(), 4) for t in running_mean]
        logger.info('bn mean max, %s (%s)', max(maxima), maxima)
        maxima = [round(t.cpu().numpy().max(), 4) for t in running_var]
        logger.info('bn var max, %s (%s)', max(maxima), maxima)


def get_bn_modules(model):
    """
    Find all BatchNorm (BN) modules that are in training mode. See
    fvcore.precise_bn.BN_MODULE_TYPES for a list of all modules that are
    included in this search.
    Args:
        model (nn.Module): a model possibly containing BN modules.
    Returns:
        list[nn.Module]: all BN modules in the model.
    """
    # Finds all the bn layers.
    bn_layers = [
        m
        for m in model.modules()
        if m.training and isinstance(m, BN_MODULE_TYPES)
    ]
    # NOTE(review): the listing this file was recovered from is cut off right after
    # the comprehension above. The return is reconstructed from the docstring and
    # from how update_bn_stats uses the result -- confirm against the upstream file.
    return bn_layers
Learn to execute automation testing from scratch with the LambdaTest Learning Hub: from setting up the prerequisites and running your first automation test, to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!