From 249aba49ab3100d0409d93ef187ebcd5a53865b9 Mon Sep 17 00:00:00 2001 From: wieschol Date: Wed, 22 Oct 2014 15:22:36 +0200 Subject: [PATCH 1/2] shuffle data --- include/caffe/data_layers.hpp | 2 ++ src/caffe/layers/hdf5_data_layer.cpp | 43 +++++++++++++++++++++++++++++++----- src/caffe/layers/hdf5_data_layer.cu | 12 +++++++--- src/caffe/proto/caffe.proto | 2 ++ 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 1f154408c27..2bb9d948169 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -171,6 +171,8 @@ class HDF5DataLayer : public Layer { unsigned int current_file_; hsize_t current_row_; std::vector > > hdf_blobs_; + std::vector data_permutation_; + std::vector file_permutation_; }; /** diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 1ceb6c24431..afa77e1e026 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -52,7 +52,20 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { for (int i = 1; i < top_size; ++i) { CHECK_EQ(hdf_blobs_[i]->num(), num); } - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->num() << " rows"; + // permutation in file is identity by default + data_permutation_.clear(); + data_permutation_.resize(hdf_blobs_[0]->num()); + for (int i = 0; i < hdf_blobs_[0]->num(); i++) + data_permutation_[i] = i; + + // shuffle data when asked + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->num() + << " rows (shuffled)"; + } else { + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->num() << " rows"; + } } template @@ -81,9 +94,23 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " << source; + file_permutation_.clear(); + file_permutation_.resize(num_files_); + // default order-permutation is identity permutation + for (int i = 0; i < num_files_; i++) + file_permutation_[i] = i; + + // only change order, when asked + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); + } + // Load the first HDF5 file and initialize the line counter. - LoadHDF5FileData(hdf_filenames_[current_file_].c_str()); + LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str()); current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); + } // Reshape blobs. const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); @@ -109,17 +136,23 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, ++current_file_; if (current_file_ == num_files_) { current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); DLOG(INFO) << "Looping around to first file."; } - LoadHDF5FileData(hdf_filenames_[current_file_].c_str()); + LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]]. + c_str()); } current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->num(); caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[current_row_ * data_dim], - &top[j]->mutable_cpu_data()[i * data_dim]); + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } } } diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 02e3821d104..990cfa66920 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -26,17 +26,23 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, current_file_ += 1; if (current_file_ == num_files_) { current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); DLOG(INFO) << "Looping around to first file."; } - LoadHDF5FileData(hdf_filenames_[current_file_].c_str()); + LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]] + .c_str()); } current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->num(); caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[current_row_ * data_dim], - &top[j]->mutable_gpu_data()[i * data_dim]); + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); } } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 888371dc778..a7489c23088 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -518,6 +518,8 @@ message HDF5DataParameter { optional string source = 1; // Specify the batch size. optional uint32 batch_size = 2; + // Specify shuffling the order of training data + optional bool shuffle = 3 [default = false]; } // Message that stores parameters used by HDF5OutputLayer From 6fe2b04dcb2663543c8a5d0b3e139c00b5199d37 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 13 Mar 2015 01:27:59 -0700 Subject: [PATCH 2/2] HDF5DataLayer shuffle: minor cleanup; clarification in HDF5DataParameter --- src/caffe/layers/hdf5_data_layer.cpp | 41 +++++++++++++++++++----------------- src/caffe/layers/hdf5_data_layer.cu | 15 +++++++------ src/caffe/proto/caffe.proto | 7 +++++- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index afa77e1e026..1d3dc1e62ac 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -14,9 +14,9 @@ #include "hdf5_hl.h" #include "stdint.h" +#include "caffe/data_layers.hpp" #include "caffe/layer.hpp" #include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" namespace caffe { @@ -48,23 +48,24 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename; // MinTopBlobs==1 guarantees at least one top blob - int num = hdf_blobs_[0]->num(); + CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis."; + const int num = hdf_blobs_[0]->shape(0); for (int i = 1; i < top_size; ++i) { - CHECK_EQ(hdf_blobs_[i]->num(), num); + CHECK_EQ(hdf_blobs_[i]->shape(0), num); } - // permutation in file is identity by default + // Default to identity permutation. data_permutation_.clear(); - data_permutation_.resize(hdf_blobs_[0]->num()); - for (int i = 0; i < hdf_blobs_[0]->num(); i++) + data_permutation_.resize(hdf_blobs_[0]->shape(0)); + for (int i = 0; i < hdf_blobs_[0]->shape(0); i++) data_permutation_[i] = i; - // shuffle data when asked + // Shuffle if needed. if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->num() - << " rows (shuffled)"; + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) + << " rows (shuffled)"; } else { - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->num() << " rows"; + DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } } @@ -96,11 +97,12 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, file_permutation_.clear(); file_permutation_.resize(num_files_); - // default order-permutation is identity permutation - for (int i = 0; i < num_files_; i++) + // Default to identity permutation. + for (int i = 0; i < num_files_; i++) { file_permutation_[i] = i; + } - // only change order, when asked + // Shuffle if needed. if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), file_permutation_.end()); } @@ -131,25 +133,26 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->num()) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { if (num_files_ > 1) { ++current_file_; if (current_file_ == num_files_) { current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) + if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); + file_permutation_.end()); + } DLOG(INFO) << "Looping around to first file."; } - LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]]. - c_str()); + LoadHDF5FileData( + hdf_filenames_[file_permutation_[current_file_]].c_str()); } current_row_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->num(); + int data_dim = top[j]->count() / top[j]->shape(0); caffe_copy(data_dim, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 990cfa66920..5e3e4ced141 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -10,9 +10,9 @@ TODO: #include "hdf5.h" #include "hdf5_hl.h" +#include "caffe/data_layers.hpp" #include "caffe/layer.hpp" #include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" namespace caffe { @@ -21,25 +21,26 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->num()) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { if (num_files_ > 1) { current_file_ += 1; if (current_file_ == num_files_) { current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) + if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); + file_permutation_.end()); + } DLOG(INFO) << "Looping around to first file."; } - LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]] - .c_str()); + LoadHDF5FileData( + hdf_filenames_[file_permutation_[current_file_]].c_str()); } current_row_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->num(); + int data_dim = top[j]->count() / top[j]->shape(0); caffe_copy(data_dim, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a7489c23088..5b21cf20028 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -518,7 +518,12 @@ message HDF5DataParameter { optional string source = 1; // Specify the batch size. optional uint32 batch_size = 2; - // Specify shuffling the order of training data + + // Specify whether to shuffle the data. + // If shuffle == true, the ordering of the HDF5 files is shuffled, + // and the ordering of data within any given HDF5 file is shuffled, + // but data between different files are not interleaved; all of a file's + // data are output (in a random order) before moving onto another file. optional bool shuffle = 3 [default = false]; }