From f4617ec993b4787b6c897dd260dbe67f15b30ba5 Mon Sep 17 00:00:00 2001 From: Tong Jin Date: Tue, 11 Jul 2017 18:40:31 +0800 Subject: [PATCH 1/5] Add "append" mode for convert_imageset. Sometimes we need to append additional images to an existing lmdb/leveldb, especially in the fine-tuning stage. If we construct all data with convert_imageset from scratch, it's inefficient when the image count is large. If "append" is set explicitly, new image data are appended after the existing rows, with no need to touch the data already constructed. --- tools/convert_imageset.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index 90cdb15d427..095231f3255 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -42,6 +42,8 @@ DEFINE_bool(encoded, false, "When this option is on, the encoded image will be save in datum"); DEFINE_string(encode_type, "", "Optional: What type should we encode the image as ('png','jpg',...)."); +DEFINE_bool(append, false, + "Append to existing lmdb/leveldb."); int main(int argc, char** argv) { #ifdef USE_OPENCV @@ -70,6 +72,7 @@ int main(int argc, char** argv) { const bool check_size = FLAGS_check_size; const bool encoded = FLAGS_encoded; const string encode_type = FLAGS_encode_type; + const bool append = FLAGS_append; std::ifstream infile(argv[2]); std::vector > lines; @@ -96,7 +99,24 @@ int main(int argc, char** argv) { // Create new DB scoped_ptr db(db::GetDB(FLAGS_backend)); - db->Open(argv[3], db::NEW); + + int start_line_id = 0; + if(append){ + db->Open(argv[3], db::WRITE); + scoped_ptr cursor(db->NewCursor()); + while (cursor->valid()) { + start_line_id++; + cursor->Next(); + } + + // start from the next line_id + start_line_id++; + LOG(INFO) << "Append from line " << start_line_id<<"."; + } + else{ + db->Open(argv[3], db::NEW); + } + scoped_ptr txn(db->NewTransaction()); // Storing to db @@ -133,7 +153,7 @@ int main(int argc, char** argv) { } } // 
sequential - string key_str = caffe::format_int(line_id, 8) + "_" + lines[line_id].first; + string key_str = caffe::format_int(line_id + start_line_id, 8) + "_" + lines[line_id].first; // Put in db string out; From a2dcf4e46160fee7c08e01b233c1eb29498845fe Mon Sep 17 00:00:00 2001 From: Tong Jin Date: Tue, 11 Jul 2017 20:12:18 +0800 Subject: [PATCH 2/5] Add "append" mode for convert_imageset. Sometimes we need to append additional images to an existing lmdb/leveldb, especially in the fine-tuning stage. If we construct all data with convert_imageset from scratch, it's inefficient when the image count is large. If "append" is set explicitly, new image data are appended after the existing rows, with no need to touch the data already constructed. --- tools/convert_imageset.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index 095231f3255..9b1c6061fb3 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -101,7 +101,7 @@ int main(int argc, char** argv) { scoped_ptr db(db::GetDB(FLAGS_backend)); int start_line_id = 0; - if(append){ + if (append) { db->Open(argv[3], db::WRITE); scoped_ptr cursor(db->NewCursor()); while (cursor->valid()) { @@ -111,11 +111,9 @@ int main(int argc, char** argv) { // start from the next line_id start_line_id++; - LOG(INFO) << "Append from line " << start_line_id<<"."; - } - else{ - db->Open(argv[3], db::NEW); + LOG(INFO) << "Append from line " << start_line_id << "."; } + else db->Open(argv[3], db::NEW); scoped_ptr txn(db->NewTransaction()); @@ -153,7 +151,8 @@ int main(int argc, char** argv) { } } // sequential - string key_str = caffe::format_int(line_id + start_line_id, 8) + "_" + lines[line_id].first; + string key_str = caffe::format_int(line_id + start_line_id, 8) + + "_" + lines[line_id].first; // Put in db string out; From b443bd068d888bbf91be95394efde13809246071 Mon Sep 17 00:00:00 2001 From: Tong Jin Date: Wed, 12 Jul 2017 09:44:46 +0800 Subject: 
[PATCH 3/5] Add "append" mode for convert_imageset. Sometimes we need to append additional images to an existing lmdb/leveldb, especially in the fine-tuning stage. If we construct all data with convert_imageset from scratch, it's inefficient when the image count is large. If "append" is set explicitly, new image data are appended after the existing rows, with no need to touch the data already constructed. --- tools/convert_imageset.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index 9b1c6061fb3..09c73d506cb 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -113,7 +113,9 @@ int main(int argc, char** argv) { start_line_id++; LOG(INFO) << "Append from line " << start_line_id << "."; } - else db->Open(argv[3], db::NEW); + else { + db->Open(argv[3], db::NEW); + } scoped_ptr txn(db->NewTransaction()); From 93b4066a0f541418dd31bbd9232edcd7afd3e8ef Mon Sep 17 00:00:00 2001 From: Tong Jin Date: Wed, 12 Jul 2017 10:27:57 +0800 Subject: [PATCH 4/5] Add "append" mode for convert_imageset. Sometimes we need to append additional images to an existing lmdb/leveldb, especially in the fine-tuning stage. If we construct all data with convert_imageset from scratch, it's inefficient when the image count is large. If "append" is set explicitly, new image data are appended after the existing rows, with no need to touch the data already constructed. 
--- tools/convert_imageset.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index 09c73d506cb..af2e8b08536 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -112,8 +112,7 @@ int main(int argc, char** argv) { // start from the next line_id start_line_id++; LOG(INFO) << "Append from line " << start_line_id << "."; - } - else { + } else { db->Open(argv[3], db::NEW); } From f52bad6795b7c4dc60c9d1978b8d34690aff96b0 Mon Sep 17 00:00:00 2001 From: Tong Jin Date: Tue, 8 Aug 2017 18:00:38 +0800 Subject: [PATCH 5/5] increase batch size of lmdb commit --- tools/convert_imageset.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index af2e8b08536..da99595ef1b 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -160,7 +160,7 @@ int main(int argc, char** argv) { CHECK(datum.SerializeToString(&out)); txn->Put(key_str, out); - if (++count % 1000 == 0) { + if (++count % 10000 == 0) { // Commit db txn->Commit(); txn.reset(db->NewTransaction()); @@ -168,7 +168,7 @@ int main(int argc, char** argv) { } } // write the last batch - if (count % 1000 != 0) { + if (count % 10000 != 0) { txn->Commit(); LOG(INFO) << "Processed " << count << " files."; }