diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp
index 9b2d5126efb..43f7b28be95 100644
--- a/include/caffe/layers/batch_norm_layer.hpp
+++ b/include/caffe/layers/batch_norm_layer.hpp
@@ -13,25 +13,22 @@ namespace caffe {
  * @brief Normalizes the input to have 0-mean and/or unit (1) variance across
  *        the batch.
  *
- * This layer computes Batch Normalization described in [1]. For
- * each channel in the data (i.e. axis 1), it subtracts the mean and divides
- * by the variance, where both statistics are computed across both spatial
- * dimensions and across the different examples in the batch.
+ * This layer computes Batch Normalization as described in [1]. For each channel
+ * in the data (i.e. axis 1), it subtracts the mean and divides by the variance,
+ * where both statistics are computed across both spatial dimensions and across
+ * the different examples in the batch.
  *
- * By default, during training time, the network is computing global mean/
- * variance statistics via a running average, which is then used at test
- * time to allow deterministic outputs for each input. You can manually
- * toggle whether the network is accumulating or using the statistics via the
- * use_global_stats option. IMPORTANT: for this feature to work, you MUST
- * set the learning rate to zero for all three parameter blobs, i.e.,
- * param {lr_mult: 0} three times in the layer definition.
+ * By default, during training time, the network is computing global
+ * mean/variance statistics via a running average, which is then used at test
+ * time to allow deterministic outputs for each input. You can manually toggle
+ * whether the network is accumulating or using the statistics via the
+ * use_global_stats option. For reference, these statistics are kept in the
+ * layer's three blobs: (0) mean, (1) variance, and (2) moving average factor.
  *
  * Note that the original paper also included a per-channel learned bias and
- * scaling factor. It is possible (though a bit cumbersome) to implement
- * this in caffe using a single-channel DummyDataLayer filled with zeros,
- * followed by a Convolution layer with output the same size as the current.
- * This produces a channel-specific value that can be added or multiplied by
- * the BatchNorm layer's output.
+ * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured
+ * with `bias_term: true` after each `BatchNormLayer` to handle both the bias
+ * and scaling factor.
  *
  * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
  *     Training by Reducing Internal Covariate Shift." arXiv preprint
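To make the pattern recommended above concrete, here is a minimal prototxt sketch of a `BatchNorm` layer followed by a `Scale` layer with `bias_term: true`, which supplies the learned per-channel scale and bias from the paper. The layer and blob names are hypothetical, and the `scale_param` field name is assumed from the `ScaleLayer` parameter message rather than taken from this patch:

    layer {
      name: "conv1/bn"
      type: "BatchNorm"
      bottom: "conv1"
      top: "conv1/bn"
      # The mean, variance, and moving average factor blobs are masked from
      # optimization automatically; see the LayerSetUp change below.
    }
    layer {
      name: "conv1/scale"
      type: "Scale"
      bottom: "conv1/bn"
      top: "conv1/bn"
      scale_param { bias_term: true }  # learned per-channel scale plus bias
    }
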
diff --git a/include/caffe/layers/bias_layer.hpp b/include/caffe/layers/bias_layer.hpp
index eedc3aaa351..9639c9cdc8a 100644
--- a/include/caffe/layers/bias_layer.hpp
+++ b/include/caffe/layers/bias_layer.hpp
@@ -10,13 +10,13 @@ namespace caffe {
 namespace caffe {
 
 /**
- * @brief Computes a sum of two input Blobs, with the shape of the
- *        latter Blob "broadcast" to match the shape of the former.
- *        Equivalent to tiling the latter Blob, then computing the elementwise
- *        sum.
+ * @brief Computes a sum of two input Blobs, with the shape of the latter Blob
+ *        "broadcast" to match the shape of the former. Equivalent to tiling
+ *        the latter Blob, then computing the elementwise sum.
  *
  * The second input may be omitted, in which case it's learned as a parameter
- * of the layer.
+ * of the layer. Note: in case both bias and scaling are desired, the two
+ * operations can be handled by a `ScaleLayer` configured with `bias_term: true`.
  */
 template <typename Dtype>
 class BiasLayer : public Layer<Dtype> {
diff --git a/include/caffe/layers/scale_layer.hpp b/include/caffe/layers/scale_layer.hpp
index 924df2e51ab..45b714d4027 100644
--- a/include/caffe/layers/scale_layer.hpp
+++ b/include/caffe/layers/scale_layer.hpp
@@ -12,13 +12,15 @@ namespace caffe {
 namespace caffe {
 
 /**
- * @brief Computes a product of two input Blobs, with the shape of the
- *        latter Blob "broadcast" to match the shape of the former.
+ * @brief Computes the elementwise product of two input Blobs, with the shape of
+ *        the latter Blob "broadcast" to match the shape of the former.
  *        Equivalent to tiling the latter Blob, then computing the elementwise
- *        product.
+ *        product. Note: for efficiency and convenience, this layer can
+ *        additionally perform a "broadcast" sum when `bias_term: true`
+ *        is set.
  *
- * The second input may be omitted, in which case it's learned as a parameter
- * of the layer.
+ * The latter, scale input may be omitted, in which case it's learned as a
+ * parameter of the layer (as is the bias, if it is included).
  */
 template <typename Dtype>
 class ScaleLayer: public Layer<Dtype> {
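As an illustration of the broadcasting described above, a small sketch of `ScaleLayer` with two bottoms, assuming the default `scale_param` axis of 1; the blob names and shapes are made up for this example:

    # bottom[0] "data" has shape N x C x H x W; bottom[1] "channel_weights" has
    # shape C, so it is broadcast across N, H, and W before the product.
    layer {
      name: "scale_by_channel"
      type: "Scale"
      bottom: "data"
      bottom: "channel_weights"
      top: "scaled_data"
    }
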
diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp
index 14e1936a8c2..b145822af32 100644
--- a/include/caffe/util/upgrade_proto.hpp
+++ b/include/caffe/util/upgrade_proto.hpp
@@ -65,6 +65,12 @@ bool NetNeedsInputUpgrade(const NetParameter& net_param);
 // Perform all necessary transformations to upgrade input fields into layers.
 void UpgradeNetInput(NetParameter* net_param);
 
+// Return true iff the Net contains batch norm layers with manual local LRs.
+bool NetNeedsBatchNormUpgrade(const NetParameter& net_param);
+
+// Perform all necessary transformations to upgrade batch norm layers.
+void UpgradeNetBatchNorm(NetParameter* net_param);
+
 // Return true iff the solver contains any old solver_type specified as enums
 bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param);
 
diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp
index a69d8f99316..0b1037edc63 100644
--- a/src/caffe/layers/batch_norm_layer.cpp
+++ b/src/caffe/layers/batch_norm_layer.cpp
@@ -34,6 +34,14 @@ void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
                 this->blobs_[i]->mutable_cpu_data());
     }
   }
+  // Mask statistics from optimization by setting local learning rates
+  // for mean, variance, and the bias correction to zero.
+  CHECK_EQ(this->layer_param_.param_size(), 0)
+      << "Cannot configure batch normalization statistics as layer parameters.";
+  for (int i = 0; i < this->blobs_.size(); ++i) {
+    ParamSpec* fixed_param_spec = this->layer_param_.add_param();
+    fixed_param_spec->set_lr_mult(0.);
+  }
 }
 
 template <typename Dtype>
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 9e186915b43..a0aacbe92f8 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -14,7 +14,8 @@ namespace caffe {
 
 bool NetNeedsUpgrade(const NetParameter& net_param) {
   return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param)
-      || NetNeedsDataUpgrade(net_param) || NetNeedsInputUpgrade(net_param);
+      || NetNeedsDataUpgrade(net_param) || NetNeedsInputUpgrade(net_param)
+      || NetNeedsBatchNormUpgrade(net_param);
 }
 
 bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
@@ -71,6 +72,14 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
     LOG(WARNING) << "Note that future Caffe releases will only support "
                  << "input layers and not input fields.";
   }
+  // NetParameter uses old style batch norm layers; try to upgrade it.
+  if (NetNeedsBatchNormUpgrade(*param)) {
+    LOG(INFO) << "Attempting to upgrade batch norm layers using deprecated "
+              << "params: " << param_file;
+    UpgradeNetBatchNorm(param);
+    LOG(INFO) << "Successfully upgraded batch norm layers using deprecated "
+              << "params.";
+  }
   return success;
 }
 
@@ -991,6 +1000,29 @@ void UpgradeNetInput(NetParameter* net_param) {
   net_param->clear_input_dim();
 }
 
+bool NetNeedsBatchNormUpgrade(const NetParameter& net_param) {
+  for (int i = 0; i < net_param.layer_size(); ++i) {
+    // Check if BatchNorm layers declare three parameters, as required by
+    // the previous BatchNorm layer definition.
+    if (net_param.layer(i).type() == "BatchNorm"
+        && net_param.layer(i).param_size() == 3) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void UpgradeNetBatchNorm(NetParameter* net_param) {
+  for (int i = 0; i < net_param->layer_size(); ++i) {
+    // Check if BatchNorm layers declare three parameters, as required by
+    // the previous BatchNorm layer definition.
+    if (net_param->layer(i).type() == "BatchNorm"
+        && net_param->layer(i).param_size() == 3) {
+      net_param->mutable_layer(i)->clear_param();
+    }
+  }
+}
+
 // Return true iff the solver contains any old solver_type specified as enums
 bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param) {
   if (solver_param.has_solver_type()) {
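For reference, a sketch of the deprecated style that the new upgrade path targets. A net with a `BatchNorm` layer declaring exactly three `param` entries, as the old documentation required, makes `NetNeedsBatchNormUpgrade()` return true; `UpgradeNetBatchNorm()` then clears those entries, and `BatchNormLayer::LayerSetUp` re-adds them with `lr_mult` set to 0. The layer and blob names are hypothetical:

    # Deprecated: statistics masked from learning by hand (auto-upgraded away).
    layer {
      name: "conv1/bn"
      type: "BatchNorm"
      bottom: "conv1"
      top: "conv1/bn"
      param { lr_mult: 0 }  # mean
      param { lr_mult: 0 }  # variance
      param { lr_mult: 0 }  # moving average factor
    }
    # Equivalent definition after the upgrade: the param entries are simply omitted.
    layer {
      name: "conv1/bn"
      type: "BatchNorm"
      bottom: "conv1"
      top: "conv1/bn"
    }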