diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index dfd2e556a2b..bf997553ee2 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -58,6 +58,12 @@ class Net { string Forward(const string& input_blob_protos, Dtype* loss = NULL); /** + * @brief Zeroes out the diffs of all net parameters. + * Should be run before Backward. + */ + void ClearParamDiffs(); + + /** * The network backward should take no input and output, since it solely * computes the gradient w.r.t the parameters, and the data has already been * provided during the forward pass. @@ -84,6 +90,13 @@ class Net { /// @brief Updates the network weights based on the diff values computed. void Update(); + /** + * @brief Shares weight data of owner blobs with shared blobs. + * + * Note: this is called by Net::Init, and thus should normally not be + * called manually. + */ + void ShareWeights(); /** * @brief For an already initialized net, implicitly copies (i.e., using no @@ -148,11 +161,19 @@ class Net { inline const vector > >& params() const { return params_; } - /// @brief returns the parameter learning rate multipliers + inline const vector*>& learnable_params() const { + return learnable_params_; + } + /// @brief returns the learnable parameter learning rate multipliers inline const vector& params_lr() const { return params_lr_; } + inline const vector& has_params_lr() const { return has_params_lr_; } + /// @brief returns the learnable parameter decay multipliers inline const vector& params_weight_decay() const { return params_weight_decay_; } + inline const vector& has_params_decay() const { + return has_params_decay_; + } const map& param_names_index() const { return param_names_index_; } @@ -213,9 +234,6 @@ class Net { /// @brief Helper for displaying debug info in Update. void UpdateDebugInfo(const int param_id); - /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. - void GetLearningRateAndWeightDecay(); - /// @brief The network name string name_; /// @brief The phase: TRAIN or TEST @@ -254,10 +272,21 @@ class Net { vector*> net_output_blobs_; /// The parameters in the network. vector > > params_; - /// the learning rate multipliers + vector*> learnable_params_; + /** + * The mapping from params_ -> learnable_params_: we have + * learnable_param_ids_.size() == params_.size(), + * and learnable_params_[learnable_param_ids_[i]] == params_[i].get() + * if and only if params_[i] is an "owner"; otherwise, params_[i] is a sharer + * and learnable_params_[learnable_param_ids_[i]] gives its owner. + */ + vector learnable_param_ids_; + /// the learning rate multipliers for learnable_params_ vector params_lr_; - /// the weight decay multipliers + vector has_params_lr_; + /// the weight decay multipliers for learnable_params_ vector params_weight_decay_; + vector has_params_decay_; /// The bytes of memory used by this net size_t memory_used_; /// Whether to compute and display debug info for the net. diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 0812b367ac3..0e5ed804b73 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -244,7 +244,7 @@ void Net::Init(const NetParameter& in_param) { for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) { layer_names_index_[layer_names_[layer_id]] = layer_id; } - GetLearningRateAndWeightDecay(); + ShareWeights(); debug_info_ = param.debug_info(); LOG(INFO) << "Network initialization done."; LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); @@ -441,6 +441,9 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, params_.push_back(layers_[layer_id]->blobs()[param_id]); param_id_vecs_[layer_id].push_back(net_param_id); param_layer_indices_.push_back(make_pair(layer_id, param_id)); + ParamSpec default_param_spec; + const ParamSpec* param_spec = (layer_param.param_size() > param_id) ? + &layer_param.param(param_id) : &default_param_spec; if (!param_size || !param_name.size() || (param_name.size() && param_names_index_.find(param_name) == param_names_index_.end())) { // This layer "owns" this parameter blob -- it is either anonymous @@ -450,6 +453,13 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, if (param_name.size()) { param_names_index_[param_name] = net_param_id; } + const int learnable_param_id = learnable_params_.size(); + learnable_params_.push_back(params_[net_param_id].get()); + learnable_param_ids_.push_back(learnable_param_id); + has_params_lr_.push_back(param_spec->has_lr_mult()); + has_params_decay_.push_back(param_spec->has_decay_mult()); + params_lr_.push_back(param_spec->lr_mult()); + params_weight_decay_.push_back(param_spec->decay_mult()); } else { // Named param blob with name we've seen before: share params const int owner_net_param_id = param_names_index_[param_name]; @@ -474,23 +484,25 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, // Strict dimension checking -- all dims must be the same. CHECK(this_blob->shape() == owner_blob->shape()); } - layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); - } -} - -template -void Net::GetLearningRateAndWeightDecay() { - LOG(INFO) << "Collecting Learning Rate and Weight Decay."; - ParamSpec default_param_spec; - for (int i = 0; i < layers_.size(); ++i) { - vector > >& layer_blobs = layers_[i]->blobs(); - for (int j = 0; j < layer_blobs.size(); ++j) { - const ParamSpec* param_spec = - (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; - params_lr_.push_back(param_spec->lr_mult()); - params_weight_decay_.push_back(param_spec->decay_mult()); + const int learnable_param_id = learnable_param_ids_[owner_net_param_id]; + if (param_spec->has_lr_mult()) { + if (has_params_lr_[learnable_param_id]) { + CHECK_EQ(param_spec->lr_mult(), params_lr_[learnable_param_id]) + << "Shared param '" << param_name << "' has mismatched lr_mult."; + } else { + has_params_lr_[learnable_param_id] = true; + params_lr_[learnable_param_id] = param_spec->lr_mult(); + } + } + if (param_spec->has_decay_mult()) { + if (has_params_decay_[learnable_param_id]) { + CHECK_EQ(param_spec->decay_mult(), + params_weight_decay_[learnable_param_id]) + << "Shared param '" << param_name << "' has mismatched decay_mult."; + } else { + has_params_decay_[learnable_param_id] = true; + params_weight_decay_[learnable_param_id] = param_spec->decay_mult(); + } } } } @@ -895,39 +907,38 @@ void Net::ToHDF5(const string& filename, bool write_diff) const { template void Net::Update() { - // First, accumulate the diffs of any shared parameters into their owner's - // diff. (Assumes that the learning rate, weight decay, etc. have already been - // accounted for in the current diff.) - for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - const int count = params_[i]->count(); - const Dtype* this_diff; - Dtype* owner_diff; + for (int i = 0; i < learnable_params_.size(); ++i) { + learnable_params_[i]->Update(); + } +} + +template +void Net::ClearParamDiffs() { + for (int i = 0; i < learnable_params_.size(); ++i) { + Blob* blob = learnable_params_[i]; switch (Caffe::mode()) { case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); + caffe_set(blob->count(), static_cast(0), + blob->mutable_cpu_diff()); break; case Caffe::GPU: #ifndef CPU_ONLY - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); #else NO_GPU; #endif break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } - // Now, update the owned parameters. +} + +template +void Net::ShareWeights() { for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - params_[i]->Update(); + if (param_owners_[i] < 0) { continue; } + params_[i]->ShareData(*params_[param_owners_[i]]); + params_[i]->ShareDiff(*params_[param_owners_[i]]); } } diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 75271138bdd..32276ac148a 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -173,24 +173,7 @@ void Solver::Step(int iters) { while (iter_ < stop_iter) { // zero-init the params - for (int i = 0; i < net_->params().size(); ++i) { - shared_ptr > blob = net_->params()[i]; - switch (Caffe::mode()) { - case Caffe::CPU: - caffe_set(blob->count(), static_cast(0), - blob->mutable_cpu_diff()); - break; - case Caffe::GPU: -#ifndef CPU_ONLY - caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); -#else - NO_GPU; -#endif - break; - } - } - + net_->ClearParamDiffs(); if (param_.test_interval() && iter_ % param_.test_interval() == 0 && (iter_ > 0 || param_.test_initialization())) { TestAll(); @@ -462,7 +445,7 @@ Dtype SGDSolver::GetLearningRate() { template void SGDSolver::PreSolve() { // Initialize the history - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); history_.clear(); update_.clear(); temp_.clear(); @@ -478,12 +461,10 @@ template void SGDSolver::ClipGradients() { const Dtype clip_gradients = this->param_.clip_gradients(); if (clip_gradients < 0) { return; } - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); Dtype sumsq_diff = 0; for (int i = 0; i < net_params.size(); ++i) { - if (this->net_->param_owners()[i] < 0) { - sumsq_diff += net_params[i]->sumsq_diff(); - } + sumsq_diff += net_params[i]->sumsq_diff(); } const Dtype l2norm_diff = std::sqrt(sumsq_diff); if (l2norm_diff > clip_gradients) { @@ -492,9 +473,7 @@ void SGDSolver::ClipGradients() { << l2norm_diff << " > " << clip_gradients << ") " << "by scale factor " << scale_factor; for (int i = 0; i < net_params.size(); ++i) { - if (this->net_->param_owners()[i] < 0) { - net_params[i]->scale_diff(scale_factor); - } + net_params[i]->scale_diff(scale_factor); } } } @@ -506,7 +485,8 @@ void SGDSolver::ApplyUpdate() { LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; } ClipGradients(); - for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) { + for (int param_id = 0; param_id < this->net_->learnable_params().size(); + ++param_id) { Normalize(param_id); Regularize(param_id); ComputeUpdateValue(param_id, rate); @@ -518,7 +498,7 @@ template void SGDSolver::Normalize(int param_id) { if (this->param_.iter_size() == 1) { return; } // Scale gradient to counterbalance accumulation. - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); switch (Caffe::mode()) { case Caffe::CPU: { @@ -542,7 +522,7 @@ void SGDSolver::Normalize(int param_id) { template void SGDSolver::Regularize(int param_id) { - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_weight_decay = this->net_->params_weight_decay(); Dtype weight_decay = this->param_.weight_decay(); @@ -604,7 +584,7 @@ void SGDSolver::Regularize(int param_id) { template void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype momentum = this->param_.momentum(); Dtype local_rate = rate * net_params_lr[param_id]; @@ -743,7 +723,7 @@ void SGDSolver::RestoreSolverStateFromHDF5(const string& state_file) { template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype momentum = this->param_.momentum(); Dtype local_rate = rate * net_params_lr[param_id]; @@ -803,7 +783,7 @@ void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector > >& net_params = this->net_->params(); + const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype delta = this->param_.delta(); Dtype local_rate = rate * net_params_lr[param_id]; diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 78bf4b3121e..e1bbbf17726 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -24,12 +24,14 @@ class GradientBasedSolverTest : public MultiDeviceTest { protected: GradientBasedSolverTest() : - seed_(1701), num_(4), channels_(3), height_(10), width_(10) {} + seed_(1701), num_(4), channels_(3), height_(10), width_(10), + constant_data_(false), share_(false) {} string snapshot_prefix_; shared_ptr > solver_; int seed_; int num_, channels_, height_, width_; + bool constant_data_, share_; Dtype delta_; // Stability constant for AdaGrad. virtual SolverParameter_SolverType solver_type() = 0; @@ -79,7 +81,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { " height: 1 " " width: 1 " " data_filler { " - " type: 'constant' " + " type: '" << string(constant_data_ ? "constant" : "gaussian") + << "' " + " std: 1.0 " " value: 1.0 " " } " " data_filler { " @@ -89,10 +93,26 @@ class GradientBasedSolverTest : public MultiDeviceTest { " } " " top: 'data' " " top: 'targets' " - " } " + " } "; + if (share_) { + proto << + " layer { " + " name: 'slice' " + " type: 'Slice' " + " bottom: 'data' " + " top: 'data1' " + " top: 'data2' " + " slice_param { " + " axis: 0 " + " } " + " } "; + } + proto << " layer { " " name: 'innerprod' " " type: 'InnerProduct' " + " param { name: 'weights' } " + " param { name: 'bias' } " " inner_product_param { " " num_output: 1 " " weight_filler { " @@ -104,9 +124,42 @@ class GradientBasedSolverTest : public MultiDeviceTest { " std: 1.0 " " } " " } " - " bottom: 'data' " - " top: 'innerprod' " - " } " + " bottom: '" << string(share_ ? "data1": "data") << "' " + " top: '" << string(share_ ? "innerprod1": "innerprod") << "' " + " } "; + if (share_) { + proto << + " layer { " + " name: 'innerprod2' " + " type: 'InnerProduct' " + " param { name: 'weights' } " + " param { name: 'bias' } " + " inner_product_param { " + " num_output: 1 " + " weight_filler { " + " type: 'gaussian' " + " std: 1.0 " + " } " + " bias_filler { " + " type: 'gaussian' " + " std: 1.0 " + " } " + " } " + " bottom: 'data2' " + " top: 'innerprod2' " + " } " + " layer { " + " name: 'concat' " + " type: 'Concat' " + " bottom: 'innerprod1' " + " bottom: 'innerprod2' " + " top: 'innerprod' " + " concat_param { " + " axis: 0 " + " } " + " } "; + } + proto << " layer { " " name: 'loss' " " type: 'EuclideanLoss' " @@ -302,6 +355,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { const Dtype kMomentum, const int kNumIters, const int kIterSize) { const double kPrecision = 1e-2; const double kMinPrecision = 1e-7; + constant_data_ = true; // Solve without accumulation and save parameters. this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum, kNumIters); @@ -383,8 +437,8 @@ class GradientBasedSolverTest : public MultiDeviceTest { // Save the resulting param values. vector > > param_copies; - const vector > >& orig_params = - solver_->net()->params(); + const vector*>& orig_params = + solver_->net()->learnable_params(); param_copies.resize(orig_params.size()); for (int i = 0; i < orig_params.size(); ++i) { param_copies[i].reset(new Blob()); @@ -417,7 +471,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { momentum, total_num_iters, kIterSize, snapshot, snapshot_name.c_str()); // Check that params now match. - const vector > >& params = solver_->net()->params(); + const vector*>& params = solver_->net()->learnable_params(); for (int i = 0; i < params.size(); ++i) { for (int j = 0; j < params[i]->count(); ++j) { EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) @@ -461,23 +515,38 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdate) { this->TestLeastSquaresUpdate(); } -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateLROneTenth) { +TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateLROneHundredth) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; + const Dtype kLearningRate = 0.01; this->TestLeastSquaresUpdate(kLearningRate); } TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithWeightDecay) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; + const Dtype kLearningRate = 0.01; const Dtype kWeightDecay = 0.5; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay); + const Dtype kMomentum = 0; + const int kNumIters = 1; + for (int i = 0; i <= kNumIters; ++i) { + this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); + } +} + +TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithWeightDecayMultiIter) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; + const int kNumIters = 4; + for (int i = 0; i <= kNumIters; ++i) { + this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); + } } TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; - const Dtype kWeightDecay = 0.0; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0; const Dtype kMomentum = 0.5; const int kNumIters = 1; for (int i = 0; i <= kNumIters; ++i) { @@ -487,8 +556,8 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) { TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; - const Dtype kWeightDecay = 0.0; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0; const Dtype kMomentum = 0.5; const int kNumIters = 4; for (int i = 0; i <= kNumIters; ++i) { @@ -499,9 +568,21 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.9; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.5; + const int kNumIters = 4; + for (int i = 0; i <= kNumIters; ++i) { + this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); + } +} + +TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.5; const int kNumIters = 4; + this->share_ = true; for (int i = 0; i <= kNumIters; ++i) { this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); } @@ -510,10 +591,22 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) { TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.9; + const int kNumIters = 4; + const int kIterSize = 2; + this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, + kIterSize); +} + +TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; const Dtype kMomentum = 0.9; const int kNumIters = 4; const int kIterSize = 2; + this->share_ = true; this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, kIterSize); } @@ -521,7 +614,7 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { TYPED_TEST(SGDSolverTest, TestSnapshot) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; + const Dtype kWeightDecay = 0.5; const Dtype kMomentum = 0.9; const int kNumIters = 4; for (int i = 1; i <= kNumIters; ++i) { @@ -529,6 +622,18 @@ TYPED_TEST(SGDSolverTest, TestSnapshot) { } } +TYPED_TEST(SGDSolverTest, TestSnapshotShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.9; + const int kNumIters = 4; + this->share_ = true; + for (int i = 1; i <= kNumIters; ++i) { + this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); + } +} + template class AdaGradSolverTest : public GradientBasedSolverTest { @@ -549,15 +654,15 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdate) { this->TestLeastSquaresUpdate(); } -TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateLROneTenth) { +TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateLROneHundredth) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; + const Dtype kLearningRate = 0.01; this->TestLeastSquaresUpdate(kLearningRate); } TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithWeightDecay) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; + const Dtype kLearningRate = 0.01; const Dtype kWeightDecay = 0.5; this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay); } @@ -565,21 +670,46 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithWeightDecay) { TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithEverything) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.0; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; const int kNumIters = 4; for (int i = 0; i <= kNumIters; ++i) { this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); } } +TYPED_TEST(AdaGradSolverTest, + TestAdaGradLeastSquaresUpdateWithEverythingShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; + const int kNumIters = 4; + this->share_ = true; + for (int i = 0; i <= kNumIters; ++i) { + this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); + } +} + TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.0; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; + const int kNumIters = 4; + const int kIterSize = 2; + this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, + kIterSize); +} + +TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; const int kNumIters = 4; const int kIterSize = 2; + this->share_ = true; this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, kIterSize); } @@ -587,14 +717,26 @@ TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { TYPED_TEST(AdaGradSolverTest, TestSnapshot) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.0; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; const int kNumIters = 4; for (int i = 1; i <= kNumIters; ++i) { this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); } } +TYPED_TEST(AdaGradSolverTest, TestSnapshotShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; + const int kNumIters = 4; + this->share_ = true; + for (int i = 1; i <= kNumIters; ++i) { + this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); + } +} + template class NesterovSolverTest : public GradientBasedSolverTest { @@ -615,23 +757,35 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) { this->TestLeastSquaresUpdate(); } -TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneTenth) { +TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneHundredth) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; + const Dtype kLearningRate = 0.01; this->TestLeastSquaresUpdate(kLearningRate); } TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; + const Dtype kLearningRate = 0.01; const Dtype kWeightDecay = 0.5; this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay); } +TYPED_TEST(NesterovSolverTest, + TestNesterovLeastSquaresUpdateWithWeightDecayMultiIter) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0; + const int kNumIters = 4; + for (int i = 0; i <= kNumIters; ++i) { + this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); + } +} + TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; - const Dtype kWeightDecay = 0.0; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0; const Dtype kMomentum = 0.5; const int kNumIters = 1; for (int i = 0; i <= kNumIters; ++i) { @@ -641,8 +795,8 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) { TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; - const Dtype kWeightDecay = 0.0; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0; const Dtype kMomentum = 0.5; const int kNumIters = 4; for (int i = 0; i <= kNumIters; ++i) { @@ -653,7 +807,7 @@ TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; + const Dtype kWeightDecay = 0.5; const Dtype kMomentum = 0.9; const int kNumIters = 4; for (int i = 0; i <= kNumIters; ++i) { @@ -661,13 +815,38 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) { } } +TYPED_TEST(NesterovSolverTest, + TestNesterovLeastSquaresUpdateWithEverythingShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.9; + const int kNumIters = 4; + this->share_ = true; + for (int i = 0; i <= kNumIters; ++i) { + this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); + } +} + TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.9; + const int kNumIters = 4; + const int kIterSize = 2; + this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, + kIterSize); +} + +TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; const Dtype kMomentum = 0.9; const int kNumIters = 4; const int kIterSize = 2; + this->share_ = true; this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, kIterSize); } @@ -675,9 +854,21 @@ TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { TYPED_TEST(NesterovSolverTest, TestSnapshot) { typedef typename TypeParam::Dtype Dtype; const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.0; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.9; + const int kNumIters = 4; + for (int i = 1; i <= kNumIters; ++i) { + this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); + } +} + +TYPED_TEST(NesterovSolverTest, TestSnapshotShare) { + typedef typename TypeParam::Dtype Dtype; + const Dtype kLearningRate = 0.01; + const Dtype kWeightDecay = 0.5; + const Dtype kMomentum = 0.9; const int kNumIters = 4; + this->share_ = true; for (int i = 1; i <= kNumIters; ++i) { this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); } diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp index 56959f4793b..12998d8912f 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -1107,11 +1107,10 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); this->net_->Forward(bottom); this->net_->Backward(); // Compute the expected update as the data minus the two diffs. @@ -1124,11 +1123,7 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { // Make sure the diffs are non-trivial. for (int i = 0; i < count; ++i) { EXPECT_NE(0, ip1_weights->cpu_diff()[i]); - EXPECT_NE(0, ip2_weights->cpu_diff()[i]); - EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); } - caffe_axpy(count, Dtype(1), ip2_weights->cpu_diff(), - shared_params.mutable_cpu_diff()); caffe_axpy(count, Dtype(-1), shared_params.cpu_diff(), shared_params.mutable_cpu_data()); const Dtype* expected_updated_params = shared_params.cpu_data(); @@ -1165,8 +1160,8 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { EXPECT_NE(0, ip1_weights->cpu_diff()[i]); EXPECT_NE(0, ip2_weights->cpu_diff()[i]); EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); - EXPECT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], - shared_params.cpu_diff()[i]); + EXPECT_FLOAT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], + shared_params.cpu_diff()[i]); } caffe_axpy(count, Dtype(-1), ip1_weights->cpu_diff(), unshared_params1.mutable_cpu_data()); @@ -1196,11 +1191,10 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) { EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); this->net_->ForwardBackward(bottom); this->net_->Update(); Blob shared_params; @@ -1223,14 +1217,13 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) { ASSERT_FALSE(NULL == ip1_weights); ASSERT_FALSE(NULL == ip2_weights); EXPECT_NE(ip1_weights, ip2_weights); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); for (int i = 0; i < count; ++i) { EXPECT_FLOAT_EQ(shared_params.cpu_data()[i], ip1_weights->cpu_data()[i]); } - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); } TYPED_TEST(NetTest, TestParamPropagateDown) {